; File: media/libvpx/vp9/common/x86/vp9_intrapred_ssse3.asm
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

SECTION_RODATA

; 16 bytes of 0x01: mask that isolates the low bit of every byte.
pb_1: times 16 db 1

; pshufb control masks.  The suffix after "sh_b" spells, one hex digit per
; output byte, the index of the source byte selected for that position
; (e.g. sh_b123456789abcdeff selects bytes 1..15 and then byte 15 again).
; Masks with fewer than 16 digits in their name pad the remaining entries
; with 0.
sh_b01234577: db 0, 1, 2, 3, 4, 5, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0
sh_b12345677: db 1, 2, 3, 4, 5, 6, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0
sh_b23456777: db 2, 3, 4, 5, 6, 7, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0
sh_b0123456777777777: db 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7
sh_b1234567777777777: db 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7
sh_b2345677777777777: db 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7
sh_b123456789abcdeff: db 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15
sh_b23456789abcdefff: db 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15
sh_b32104567: db 3, 2, 1, 0, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0
sh_b8091a2b345: db 8, 0, 9, 1, 10, 2, 11, 3, 4, 5, 0, 0, 0, 0, 0, 0
sh_b76543210: db 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0
sh_b65432108: db 6, 5, 4, 3, 2, 1, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0
sh_b54321089: db 5, 4, 3, 2, 1, 0, 8, 9, 0, 0, 0, 0, 0, 0, 0, 0
sh_b89abcdef: db 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0
sh_bfedcba9876543210: db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
sh_b1233: db 1, 2, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
sh_b2333: db 2, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

SECTION .text

; ---------------------------------------------------------------------------
; h_predictor_NxN
; Named registers: dst, stride, line, left.  Only two arguments are
; auto-loaded by cglobal; the register of the third argument (called "above"
; in the other predictors of this file) is reused as the loop counter "line",
; and "left" is fetched explicitly with movifnidn.
;
; Every output row is filled with one byte of left[]: m0 is an all-zero
; pshufb mask, so the shuffle copies source byte 0 into every lane.
; Two rows are written per iteration; lineq counts from -N/2 up to 0 while
; leftq has been advanced by N, so [leftq+lineq*2] walks left[0], left[2], ...
; ---------------------------------------------------------------------------
INIT_MMX ssse3
cglobal h_predictor_4x4, 2, 4, 3, dst, stride, line, left
  movifnidn   leftq, leftmp
  add         leftq, 4
  mov         lineq, -2
  pxor        m0, m0
.loop:
  movd        m1, [leftq+lineq*2]
  movd        m2, [leftq+lineq*2+1]
  pshufb      m1, m0
  pshufb      m2, m0
  movd        [dstq], m1
  movd        [dstq+strideq], m2
  lea         dstq, [dstq+strideq*2]
  inc         lineq
  jnz .loop
  REP_RET

INIT_MMX ssse3
cglobal h_predictor_8x8, 2, 4, 3, dst, stride, line, left
  movifnidn   leftq, leftmp
  add         leftq, 8
  mov         lineq, -4
  pxor        m0, m0
.loop:
  movd        m1, [leftq+lineq*2]
  movd        m2, [leftq+lineq*2+1]
  pshufb      m1, m0
  pshufb      m2, m0
  movq        [dstq], m1
  movq        [dstq+strideq], m2
  lea         dstq, [dstq+strideq*2]
  inc         lineq
  jnz .loop
  REP_RET

INIT_XMM ssse3
cglobal h_predictor_16x16, 2, 4, 3, dst, stride, line, left
  movifnidn   leftq, leftmp
  add         leftq, 16
  mov         lineq, -8
  pxor        m0, m0
.loop:
  movd        m1, [leftq+lineq*2]
  movd        m2, [leftq+lineq*2+1]
  pshufb      m1, m0
  pshufb      m2, m0
  mova        [dstq], m1
  mova        [dstq+strideq], m2
  lea         dstq, [dstq+strideq*2]
  inc         lineq
  jnz .loop
  REP_RET

INIT_XMM ssse3
cglobal h_predictor_32x32, 2, 4, 3, dst, stride, line, left
  movifnidn   leftq, leftmp
  add         leftq, 32
  mov         lineq, -16
  pxor        m0, m0
.loop:
  movd        m1, [leftq+lineq*2]
  movd        m2, [leftq+lineq*2+1]
  pshufb      m1, m0
  pshufb      m2, m0
  ; each 32-byte row is written as two 16-byte halves
  mova        [dstq], m1
  mova        [dstq+16], m1
  mova        [dstq+strideq], m2
  mova        [dstq+strideq+16], m2
  lea         dstq, [dstq+strideq*2]
  inc         lineq
  jnz .loop
  REP_RET

; ---------------------------------------------------------------------------
; d45_predictor_NxN(dst, stride, above)
; Computes the 3-tap filter (x + 2y + z + 2) >> 2 over the above row using
; the avg/xor/and/sub/avg sequence (explained at the macro
; X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 below), then stores it row by row, advancing
; the filtered row by one byte for every output line.
; goffset is the scratch register handed to GET_GOT/RESTORE_GOT.
; ---------------------------------------------------------------------------
INIT_MMX ssse3
cglobal d45_predictor_4x4, 3, 4, 4, dst, stride, above, goffset
  GET_GOT     goffsetq

  movq        m0, [aboveq]
  pshufb      m2, m0, [GLOBAL(sh_b23456777)]
  pshufb      m1, m0, [GLOBAL(sh_b01234577)]
  pshufb      m0, [GLOBAL(sh_b12345677)]
  pavgb       m3, m2, m1
  pxor        m2, m1
  pand        m2, [GLOBAL(pb_1)]
  psubb       m3, m2
  pavgb       m0, m3

  ; store 4 lines
  movd        [dstq], m0
  psrlq       m0, 8
  movd        [dstq+strideq], m0
  lea         dstq, [dstq+strideq*2]
  psrlq       m0, 8
  movd        [dstq], m0
  psrlq       m0, 8
  movd        [dstq+strideq], m0

  RESTORE_GOT
  RET

INIT_MMX ssse3
cglobal d45_predictor_8x8, 3, 4, 4, dst, stride, above, goffset
  GET_GOT     goffsetq

  movq        m0, [aboveq]
  mova        m1, [GLOBAL(sh_b12345677)]
  ; above is fully loaded; its register becomes stride3
  DEFINE_ARGS dst, stride, stride3
  lea         stride3q, [strideq*3]
  pshufb      m2, m0, [GLOBAL(sh_b23456777)]
  pavgb       m3, m2, m0
  pxor        m2, m0
  pshufb      m0, m1
  pand        m2, [GLOBAL(pb_1)]
  psubb       m3, m2
  pavgb       m0, m3

  ; store 4 lines (m1 shifts the row by one byte, repeating the last byte)
  movq        [dstq], m0
  pshufb      m0, m1
  movq        [dstq+strideq], m0
  pshufb      m0, m1
  movq        [dstq+strideq*2], m0
  pshufb      m0, m1
  movq        [dstq+stride3q], m0
  pshufb      m0, m1
  lea         dstq, [dstq+strideq*4]

  ; store next 4 lines
  movq        [dstq], m0
  pshufb      m0, m1
  movq        [dstq+strideq], m0
  pshufb      m0, m1
  movq        [dstq+strideq*2], m0
  pshufb      m0, m1
  movq        [dstq+stride3q], m0

  RESTORE_GOT
  RET

INIT_XMM ssse3
cglobal d45_predictor_16x16, 3, 6, 4, dst, stride, above, dst8, line, goffset
  GET_GOT     goffsetq

  mova        m0, [aboveq]
  DEFINE_ARGS dst, stride, stride3, dst8, line
  lea         stride3q, [strideq*3]
  lea         dst8q, [dstq+strideq*8]
  mova        m1, [GLOBAL(sh_b123456789abcdeff)]
  pshufb      m2, m0, [GLOBAL(sh_b23456789abcdefff)]
  pavgb       m3, m2, m0
  pxor        m2, m0
  pshufb      m0, m1
  pand        m2, [GLOBAL(pb_1)]
  psubb       m3, m2
  pavgb       m0, m3

  ; first 4 lines and first half of 3rd 4 lines
  ; (the high 8 bytes of row r are also the low 8 bytes of row r+8)
  mov         lined, 2
.loop:
  mova        [dstq], m0
  movhps      [dst8q], m0
  pshufb      m0, m1
  mova        [dstq+strideq], m0
  movhps      [dst8q+strideq], m0
  pshufb      m0, m1
  mova        [dstq+strideq*2], m0
  movhps      [dst8q+strideq*2], m0
  pshufb      m0, m1
  mova        [dstq+stride3q], m0
  movhps      [dst8q+stride3q], m0
  pshufb      m0, m1
  lea         dstq, [dstq+strideq*4]
  lea         dst8q, [dst8q+strideq*4]
  dec         lined
  jnz .loop

  ; bottom-right 8x8 block
  movhps      [dstq+8], m0
  movhps      [dstq+strideq+8], m0
  movhps      [dstq+strideq*2+8], m0
  movhps      [dstq+stride3q+8], m0
  lea         dstq, [dstq+strideq*4]
  movhps      [dstq+8], m0
  movhps      [dstq+strideq+8], m0
  movhps      [dstq+strideq*2+8], m0
  movhps      [dstq+stride3q+8], m0

  RESTORE_GOT
  RET

INIT_XMM ssse3
cglobal d45_predictor_32x32, 3, 6, 7, dst, stride, above, dst16, line, goffset
  GET_GOT     goffsetq

  mova        m0, [aboveq]
  mova        m4, [aboveq+16]
  DEFINE_ARGS dst, stride, stride3, dst16, line
  lea         stride3q, [strideq*3]
  ; dst16 = dst + 16 rows, built from two *8 steps
  lea         dst16q, [dstq+strideq*8]
  lea         dst16q, [dst16q+strideq*8]
  mova        m1, [GLOBAL(sh_b123456789abcdeff)]
  ; m4 <- filtered high 16 bytes
  pshufb      m2, m4, [GLOBAL(sh_b23456789abcdefff)]
  pavgb       m3, m2, m4
  pxor        m2, m4
  palignr     m5, m4, m0, 1
  palignr     m6, m4, m0, 2
  pshufb      m4, m1
  pand        m2, [GLOBAL(pb_1)]
  psubb       m3, m2
  pavgb       m4, m3
  ; m5 <- filtered low 16 bytes
  pavgb       m3, m0, m6
  pxor        m0, m6
  pand        m0, [GLOBAL(pb_1)]
  psubb       m3, m0
  pavgb       m5, m3

  ; write 4x4 lines (and the first half of the second 4x4 lines)
  mov         lined, 4
.loop:
  mova        [dstq], m5
  mova        [dstq+16], m4
  mova        [dst16q], m4
  palignr     m3, m4, m5, 1
  pshufb      m4, m1
  mova        [dstq+strideq], m3
  mova        [dstq+strideq+16], m4
  mova        [dst16q+strideq], m4
  palignr     m5, m4, m3, 1
  pshufb      m4, m1
  mova        [dstq+strideq*2], m5
  mova        [dstq+strideq*2+16], m4
  mova        [dst16q+strideq*2], m4
  palignr     m3, m4, m5, 1
  pshufb      m4, m1
  mova        [dstq+stride3q], m3
  mova        [dstq+stride3q+16], m4
  mova        [dst16q+stride3q], m4
  palignr     m5, m4, m3, 1
  pshufb      m4, m1
  lea         dstq, [dstq+strideq*4]
  lea         dst16q, [dst16q+strideq*4]
  dec         lined
  jnz .loop

  ; write second half of second 4x4 lines
  mova        [dstq+16], m4
  mova        [dstq+strideq+16], m4
  mova        [dstq+strideq*2+16], m4
  mova        [dstq+stride3q+16], m4
  lea         dstq, [dstq+strideq*4]
  mova        [dstq+16], m4
  mova        [dstq+strideq+16], m4
  mova        [dstq+strideq*2+16], m4
  mova        [dstq+stride3q+16], m4
  lea         dstq, [dstq+strideq*4]
  mova        [dstq+16], m4
  mova        [dstq+strideq+16], m4
  mova        [dstq+strideq*2+16], m4
  mova        [dstq+stride3q+16], m4
  lea         dstq, [dstq+strideq*4]
  mova        [dstq+16], m4
  mova        [dstq+strideq+16], m4
  mova        [dstq+strideq*2+16], m4
  mova        [dstq+stride3q+16], m4

  RESTORE_GOT
  RET

; ------------------------------------------
; input: x, y, z, result
;
; trick from pascal
; (x+2y+z+2)>>2 can be calculated as:
; result = avg(x,z)
; result -= xor(x,z) & 1
; result = avg(result,y)
;
; Note: the third operand (z) is overwritten with xor(x,z) & 1;
; x and y are left untouched.  Requires GLOBAL(pb_1) to be addressable.
; ------------------------------------------
%macro X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 4
  pavgb       %4, %1, %3
  pxor        %3, %1
  pand        %3, [GLOBAL(pb_1)]
  psubb       %4, %3
  pavgb       %4, %2
%endmacro

; ---------------------------------------------------------------------------
; d63_predictor_NxN(dst, stride, above)
; Builds two filtered versions of the above row: a 2-tap average of
; neighbouring bytes (pavgb) and the 3-tap (x+2y+z+2)>>2 filter.  Rows
; alternate 2-tap, 3-tap, and both are advanced by one byte after each pair
; of rows has been stored.
; ---------------------------------------------------------------------------
INIT_XMM ssse3
cglobal d63_predictor_4x4, 3, 4, 5, dst, stride, above, goffset
  GET_GOT     goffsetq

  movq        m3, [aboveq]
  pshufb      m1, m3, [GLOBAL(sh_b23456777)]
  pshufb      m2, m3, [GLOBAL(sh_b12345677)]

  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m3, m2, m1, m4
  pavgb       m3, m2

  ; store 4 lines
  movd        [dstq], m3
  movd        [dstq+strideq], m4
  lea         dstq, [dstq+strideq*2]
  psrldq      m3, 1
  psrldq      m4, 1
  movd        [dstq], m3
  movd        [dstq+strideq], m4
  RESTORE_GOT
  RET

INIT_XMM ssse3
cglobal d63_predictor_8x8, 3, 4, 5, dst, stride, above, goffset
  GET_GOT     goffsetq

  movq        m3, [aboveq]
  DEFINE_ARGS dst, stride, stride3
  lea         stride3q, [strideq*3]
  pshufb      m1, m3, [GLOBAL(sh_b2345677777777777)]
  pshufb      m0, m3, [GLOBAL(sh_b0123456777777777)]
  pshufb      m2, m3, [GLOBAL(sh_b1234567777777777)]
  pshufb      m3, [GLOBAL(sh_b0123456777777777)]

  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m2, m1, m4
  pavgb       m3, m2

  ; store 4 lines
  movq        [dstq], m3
  movq        [dstq+strideq], m4
  psrldq      m3, 1
  psrldq      m4, 1
  movq        [dstq+strideq*2], m3
  movq        [dstq+stride3q], m4
  lea         dstq, [dstq+strideq*4]
  psrldq      m3, 1
  psrldq      m4, 1

  ; store 4 lines
  movq        [dstq], m3
  movq        [dstq+strideq], m4
  psrldq      m3, 1
  psrldq      m4, 1
  movq        [dstq+strideq*2], m3
  movq        [dstq+stride3q], m4
  RESTORE_GOT
  RET

INIT_XMM ssse3
cglobal d63_predictor_16x16, 3, 5, 5, dst, stride, above, line, goffset
  GET_GOT     goffsetq

  mova        m0, [aboveq]
  DEFINE_ARGS dst, stride, stride3, line
  lea         stride3q, [strideq*3]
  mova        m1, [GLOBAL(sh_b123456789abcdeff)]
  pshufb      m2, m0, [GLOBAL(sh_b23456789abcdefff)]
  pshufb      m3, m0, m1

  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m3, m2, m4
  pavgb       m0, m3

  ; 4 rows per iteration; m1 advances a row by one byte, repeating the last
  mov         lined, 4
.loop:
  mova        [dstq], m0
  mova        [dstq+strideq], m4
  pshufb      m0, m1
  pshufb      m4, m1
  mova        [dstq+strideq*2], m0
  mova        [dstq+stride3q], m4
  pshufb      m0, m1
  pshufb      m4, m1
  lea         dstq, [dstq+strideq*4]
  dec         lined
  jnz .loop
  RESTORE_GOT
  REP_RET

INIT_XMM ssse3
cglobal d63_predictor_32x32, 3, 5, 8, dst, stride, above, line, goffset
  GET_GOT     goffsetq

  mova        m0, [aboveq]
  mova        m7, [aboveq+16]
  DEFINE_ARGS dst, stride, stride3, line
  mova        m1, [GLOBAL(sh_b123456789abcdeff)]
  lea         stride3q, [strideq*3]
  pshufb      m2, m7, [GLOBAL(sh_b23456789abcdefff)]
  pshufb      m3, m7, m1

  ; high 16 bytes: m4 = 3-tap, m7 = 2-tap
  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m2, m4
  palignr     m6, m7, m0, 1
  palignr     m5, m7, m0, 2
  pavgb       m7, m3

  ; low 16 bytes: m2 = 3-tap, m0 = 2-tap
  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m6, m5, m2
  pavgb       m0, m6

  mov         lined, 8
.loop:
  mova        [dstq], m0
  mova        [dstq+16], m7
  mova        [dstq+strideq], m2
  mova        [dstq+strideq+16], m4
  palignr     m3, m7, m0, 1
  palignr     m5, m4, m2, 1
  pshufb      m7, m1
  pshufb      m4, m1

  mova        [dstq+strideq*2], m3
  mova        [dstq+strideq*2+16], m7
  mova        [dstq+stride3q], m5
  mova        [dstq+stride3q+16], m4
  palignr     m0, m7, m3, 1
  palignr     m2, m4, m5, 1
  pshufb      m7, m1
  pshufb      m4, m1
  lea         dstq, [dstq+strideq*4]
  dec         lined
  jnz .loop
  RESTORE_GOT
  REP_RET

; ---------------------------------------------------------------------------
; d153_predictor_NxN(dst, stride, above, left)
; Reads left[], the byte at above[-1] (top-left) and the above row.  The
; per-function diagrams name the 2-tap averages "A", the 3-tap averages
; along the left edge "B", and the 3-tap averages along the top "C, D, ...".
; Each row is the row above it shifted right by two bytes with a new A/B
; pair inserted on the left.
; ---------------------------------------------------------------------------
INIT_XMM ssse3
cglobal d153_predictor_4x4, 4, 5, 4, dst, stride, above, left, goffset
  GET_GOT     goffsetq
  movd        m0, [leftq]                       ; l1, l2, l3, l4
  movd        m1, [aboveq-1]                    ; tl, t1, t2, t3
  punpckldq   m0, m1                            ; l1, l2, l3, l4, tl, t1, t2, t3
  pshufb      m0, [GLOBAL(sh_b32104567)]        ; l4, l3, l2, l1, tl, t1, t2, t3
  psrldq      m1, m0, 1                         ; l3, l2, l1, tl, t1, t2, t3
  psrldq      m2, m0, 2                         ; l2, l1, tl, t1, t2, t3
  ; comments below are for a predictor like this
  ; A1 B1 C1 D1
  ; A2 B2 A1 B1
  ; A3 B3 A2 B2
  ; A4 B4 A3 B3
  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3  ; 3-tap avg B4 B3 B2 B1 C1 D1
  pavgb       m1, m0                            ; 2-tap avg A4 A3 A2 A1

  punpcklqdq  m3, m1                            ; B4 B3 B2 B1 C1 D1 x x A4 A3 A2 A1 ..

  DEFINE_ARGS dst, stride, stride3
  lea         stride3q, [strideq*3]
  pshufb      m3, [GLOBAL(sh_b8091a2b345)]      ; A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 ..
  ; rows are written bottom-up
  movd        [dstq+stride3q], m3
  psrldq      m3, 2                             ; A3 B3 A2 B2 A1 B1 C1 D1 ..
  movd        [dstq+strideq*2], m3
  psrldq      m3, 2                             ; A2 B2 A1 B1 C1 D1 ..
  movd        [dstq+strideq], m3
  psrldq      m3, 2                             ; A1 B1 C1 D1 ..
  movd        [dstq], m3
  RESTORE_GOT
  RET

INIT_XMM ssse3
cglobal d153_predictor_8x8, 4, 5, 8, dst, stride, above, left, goffset
  GET_GOT     goffsetq
  movq        m0, [leftq]                       ; [0- 7] l1-8 [byte]
  movhps      m0, [aboveq-1]                    ; [8-15] tl, t1-7 [byte]
  pshufb      m1, m0, [GLOBAL(sh_b76543210)]    ; l8-1 [word]
  pshufb      m2, m0, [GLOBAL(sh_b65432108)]    ; l7-1,tl [word]
  pshufb      m3, m0, [GLOBAL(sh_b54321089)]    ; l6-1,tl,t1 [word]
  pshufb      m0, [GLOBAL(sh_b89abcdef)]        ; tl,t1-7 [word]
  psrldq      m4, m0, 1                         ; t1-7 [word]
  psrldq      m5, m0, 2                         ; t2-7 [word]
  ; comments below are for a predictor like this
  ; A1 B1 C1 D1 E1 F1 G1 H1
  ; A2 B2 A1 B1 C1 D1 E1 F1
  ; A3 B3 A2 B2 A1 B1 C1 D1
  ; A4 B4 A3 B3 A2 B2 A1 B1
  ; A5 B5 A4 B4 A3 B3 A2 B2
  ; A6 B6 A5 B5 A4 B4 A3 B3
  ; A7 B7 A6 B6 A5 B5 A4 B4
  ; A8 B8 A7 B7 A6 B6 A5 B5
  pavgb       m6, m1, m2                        ; 2-tap avg A8-A1

  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m4, m5, m7  ; 3-tap avg C-H1

  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m2, m3, m0  ; 3-tap avg B8-1

  punpcklbw   m6, m0                            ; A-B8, A-B7 ... A-B2, A-B1

  DEFINE_ARGS dst, stride, stride3
  lea         stride3q, [strideq*3]

  movhps      [dstq+stride3q], m6               ; A-B4, A-B3, A-B2, A-B1
  palignr     m0, m7, m6, 10                    ; A-B3, A-B2, A-B1, C-H1
  movq        [dstq+strideq*2], m0
  psrldq      m0, 2                             ; A-B2, A-B1, C-H1
  movq        [dstq+strideq], m0
  psrldq      m0, 2                             ; A-H1
  movq        [dstq], m0
  lea         dstq, [dstq+strideq*4]
  movq        [dstq+stride3q], m6               ; A-B8, A-B7, A-B6, A-B5
  psrldq      m6, 2                             ; A-B7, A-B6, A-B5, A-B4
  movq        [dstq+strideq*2], m6
  psrldq      m6, 2                             ; A-B6, A-B5, A-B4, A-B3
  movq        [dstq+strideq], m6
  psrldq      m6, 2                             ; A-B5, A-B4, A-B3, A-B2
  movq        [dstq], m6
  RESTORE_GOT
  RET

INIT_XMM ssse3
cglobal d153_predictor_16x16, 4, 5, 8, dst, stride, above, left, goffset
  GET_GOT     goffsetq
  mova        m0, [leftq]
  movu        m7, [aboveq-1]
  ; comments below are for a predictor like this
  ; A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1 M1 N1 O1 P1
  ; A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1 M1 N1
  ; A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1
  ; A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1
  ; A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1
  ; A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1
  ; A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1
  ; A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1
  ; A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2
  ; Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3
  ; Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4
  ; Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5
  ; Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6
  ; Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7
  ; Af Bf Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8
  ; Ag Bg Af Bf Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9
  pshufb      m6, m7, [GLOBAL(sh_bfedcba9876543210)]
  palignr     m5, m0, m6, 15
  palignr     m3, m0, m6, 14

  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m5, m3, m4  ; 3-tap avg B3-Bg
  pavgb       m5, m0                            ; A1 - Ag

  punpcklbw   m0, m4, m5                        ; A-B8 ... A-B1
  punpckhbw   m4, m5                            ; A-B9 ... A-Bg

  pshufb      m3, m7, [GLOBAL(sh_b123456789abcdeff)]
  pshufb      m5, m7, [GLOBAL(sh_b23456789abcdefff)]

  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m5, m1  ; 3-tap avg C1-P1

  pshufb      m6, m0, [GLOBAL(sh_bfedcba9876543210)]
  DEFINE_ARGS dst, stride, stride3
  lea         stride3q, [strideq*3]
  ; rows 0-7: window sliding over m1:m6
  palignr     m2, m1, m6, 14
  mova        [dstq], m2
  palignr     m2, m1, m6, 12
  mova        [dstq+strideq], m2
  palignr     m2, m1, m6, 10
  mova        [dstq+strideq*2], m2
  palignr     m2, m1, m6, 8
  mova        [dstq+stride3q], m2
  lea         dstq, [dstq+strideq*4]
  palignr     m2, m1, m6, 6
  mova        [dstq], m2
  palignr     m2, m1, m6, 4
  mova        [dstq+strideq], m2
  palignr     m2, m1, m6, 2
  mova        [dstq+strideq*2], m2
  pshufb      m4, [GLOBAL(sh_bfedcba9876543210)]
  mova        [dstq+stride3q], m6
  lea         dstq, [dstq+strideq*4]

  ; rows 8-15: window sliding over m6:m4
  palignr     m2, m6, m4, 14
  mova        [dstq], m2
  palignr     m2, m6, m4, 12
  mova        [dstq+strideq], m2
  palignr     m2, m6, m4, 10
  mova        [dstq+strideq*2], m2
  palignr     m2, m6, m4, 8
  mova        [dstq+stride3q], m2
  lea         dstq, [dstq+strideq*4]
  palignr     m2, m6, m4, 6
  mova        [dstq], m2
  palignr     m2, m6, m4, 4
  mova        [dstq+strideq], m2
  palignr     m2, m6, m4, 2
  mova        [dstq+strideq*2], m2
  mova        [dstq+stride3q], m4
  RESTORE_GOT
  RET

INIT_XMM ssse3
cglobal d153_predictor_32x32, 4, 5, 8, dst, stride, above, left, goffset
  GET_GOT     goffsetq
  mova        m0, [leftq]
  movu        m7, [aboveq-1]
  movu        m1, [aboveq+15]

  pshufb      m4, m1, [GLOBAL(sh_b123456789abcdeff)]
  pshufb      m6, m1, [GLOBAL(sh_b23456789abcdefff)]

  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m4, m6, m2  ; 3-tap avg above [high]

  palignr     m3, m1, m7, 1
  palignr     m5, m1, m7, 2

  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m5, m1  ; 3-tap avg above [low]

  pshufb      m7, [GLOBAL(sh_bfedcba9876543210)]
  palignr     m5, m0, m7, 15
  palignr     m3, m0, m7, 14

  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m5, m3, m4  ; 3-tap avg B3-Bg
  pavgb       m5, m0                            ; A1 - Ag
  punpcklbw   m6, m4, m5                        ; A-B8 ... A-B1
  punpckhbw   m4, m5                            ; A-B9 ... A-Bg
  pshufb      m6, [GLOBAL(sh_bfedcba9876543210)]
  pshufb      m4, [GLOBAL(sh_bfedcba9876543210)]

  DEFINE_ARGS dst, stride, stride3, left, line
  lea         stride3q, [strideq*3]

  ; rows 0-7: low half slides over m1:m6, high half over m2:m1
  palignr     m5, m2, m1, 14
  palignr     m7, m1, m6, 14
  mova        [dstq], m7
  mova        [dstq+16], m5
  palignr     m5, m2, m1, 12
  palignr     m7, m1, m6, 12
  mova        [dstq+strideq], m7
  mova        [dstq+strideq+16], m5
  palignr     m5, m2, m1, 10
  palignr     m7, m1, m6, 10
  mova        [dstq+strideq*2], m7
  mova        [dstq+strideq*2+16], m5
  palignr     m5, m2, m1, 8
  palignr     m7, m1, m6, 8
  mova        [dstq+stride3q], m7
  mova        [dstq+stride3q+16], m5
  lea         dstq, [dstq+strideq*4]
  palignr     m5, m2, m1, 6
  palignr     m7, m1, m6, 6
  mova        [dstq], m7
  mova        [dstq+16], m5
  palignr     m5, m2, m1, 4
  palignr     m7, m1, m6, 4
  mova        [dstq+strideq], m7
  mova        [dstq+strideq+16], m5
  palignr     m5, m2, m1, 2
  palignr     m7, m1, m6, 2
  mova        [dstq+strideq*2], m7
  mova        [dstq+strideq*2+16], m5
  mova        [dstq+stride3q], m6
  mova        [dstq+stride3q+16], m1
  lea         dstq, [dstq+strideq*4]

  ; rows 8-15: low half slides over m6:m4, high half over m1:m6
  palignr     m5, m1, m6, 14
  palignr     m3, m6, m4, 14
  mova        [dstq], m3
  mova        [dstq+16], m5
  palignr     m5, m1, m6, 12
  palignr     m3, m6, m4, 12
  mova        [dstq+strideq], m3
  mova        [dstq+strideq+16], m5
  palignr     m5, m1, m6, 10
  palignr     m3, m6, m4, 10
  mova        [dstq+strideq*2], m3
  mova        [dstq+strideq*2+16], m5
  palignr     m5, m1, m6, 8
  palignr     m3, m6, m4, 8
  mova        [dstq+stride3q], m3
  mova        [dstq+stride3q+16], m5
  lea         dstq, [dstq+strideq*4]
  palignr     m5, m1, m6, 6
  palignr     m3, m6, m4, 6
  mova        [dstq], m3
  mova        [dstq+16], m5
  palignr     m5, m1, m6, 4
  palignr     m3, m6, m4, 4
  mova        [dstq+strideq], m3
  mova        [dstq+strideq+16], m5
  palignr     m5, m1, m6, 2
  palignr     m3, m6, m4, 2
  mova        [dstq+strideq*2], m3
  mova        [dstq+strideq*2+16], m5
  mova        [dstq+stride3q], m4
  mova        [dstq+stride3q+16], m6
  lea         dstq, [dstq+strideq*4]

  ; second 16 bytes of left[]
  mova        m7, [leftq]
  mova        m3, [leftq+16]
  palignr     m5, m3, m7, 15
  palignr     m0, m3, m7, 14

  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m3, m5, m0, m2  ; 3-tap avg Bh -
  pavgb       m5, m3                            ; Ah -
  punpcklbw   m3, m2, m5                        ; A-B8 ... A-B1
  punpckhbw   m2, m5                            ; A-B9 ... A-Bg
  pshufb      m3, [GLOBAL(sh_bfedcba9876543210)]
  pshufb      m2, [GLOBAL(sh_bfedcba9876543210)]

  ; rows 16-23: low half slides over m4:m3, high half over m6:m4
  palignr     m7, m6, m4, 14
  palignr     m0, m4, m3, 14
  mova        [dstq], m0
  mova        [dstq+16], m7
  palignr     m7, m6, m4, 12
  palignr     m0, m4, m3, 12
  mova        [dstq+strideq], m0
  mova        [dstq+strideq+16], m7
  palignr     m7, m6, m4, 10
  palignr     m0, m4, m3, 10
  mova        [dstq+strideq*2], m0
  mova        [dstq+strideq*2+16], m7
  palignr     m7, m6, m4, 8
  palignr     m0, m4, m3, 8
  mova        [dstq+stride3q], m0
  mova        [dstq+stride3q+16], m7
  lea         dstq, [dstq+strideq*4]
  palignr     m7, m6, m4, 6
  palignr     m0, m4, m3, 6
  mova        [dstq], m0
  mova        [dstq+16], m7
  palignr     m7, m6, m4, 4
  palignr     m0, m4, m3, 4
  mova        [dstq+strideq], m0
  mova        [dstq+strideq+16], m7
  palignr     m7, m6, m4, 2
  palignr     m0, m4, m3, 2
  mova        [dstq+strideq*2], m0
  mova        [dstq+strideq*2+16], m7
  mova        [dstq+stride3q], m3
  mova        [dstq+stride3q+16], m4
  lea         dstq, [dstq+strideq*4]

  ; rows 24-31: low half slides over m3:m2, high half over m4:m3
  palignr     m7, m4, m3, 14
  palignr     m0, m3, m2, 14
  mova        [dstq], m0
  mova        [dstq+16], m7
  palignr     m7, m4, m3, 12
  palignr     m0, m3, m2, 12
  mova        [dstq+strideq], m0
  mova        [dstq+strideq+16], m7
  palignr     m7, m4, m3, 10
  palignr     m0, m3, m2, 10
  mova        [dstq+strideq*2], m0
  mova        [dstq+strideq*2+16], m7
  palignr     m7, m4, m3, 8
  palignr     m0, m3, m2, 8
  mova        [dstq+stride3q], m0
  mova        [dstq+stride3q+16], m7
  lea         dstq, [dstq+strideq*4]
  palignr     m7, m4, m3, 6
  palignr     m0, m3, m2, 6
  mova        [dstq], m0
  mova        [dstq+16], m7
  palignr     m7, m4, m3, 4
  palignr     m0, m3, m2, 4
  mova        [dstq+strideq], m0
  mova        [dstq+strideq+16], m7
  palignr     m7, m4, m3, 2
  palignr     m0, m3, m2, 2
  mova        [dstq+strideq*2], m0
  mova        [dstq+strideq*2+16], m7
  mova        [dstq+stride3q], m2
  mova        [dstq+stride3q+16], m3

  RESTORE_GOT
  RET

; ---------------------------------------------------------------------------
; d207_predictor_NxN(dst, stride, <unused>, left)
; Uses only left[].  The 2-tap averages of neighbouring left bytes are
; interleaved with the 3-tap (x+2y+z+2)>>2 values; each row is the previous
; row advanced by two bytes, and the tail is padded with the last left byte.
; The third argument is not read; in the 8x8 and larger versions its
; register is reused as stride3.
; ---------------------------------------------------------------------------
INIT_MMX ssse3
cglobal d207_predictor_4x4, 4, 5, 4, dst, stride, unused, left, goffset
  GET_GOT     goffsetq
  movd        m0, [leftq]                       ; abcd [byte]
  pshufb      m1, m0, [GLOBAL(sh_b1233)]        ; bcdd [byte]
  pshufb      m3, m0, [GLOBAL(sh_b2333)]        ; cddd

  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m3, m2
  pavgb       m1, m0                            ; ab, bc, cd, d [byte]

  punpcklbw   m1, m2                            ; ab, a2bc, bc, b2cd, cd, c3d, d, d
  movd        [dstq], m1
  psrlq       m1, 16                            ; bc, b2cd, cd, c3d, d, d
  movd        [dstq+strideq], m1
  lea         dstq, [dstq+strideq*2]
  psrlq       m1, 16                            ; cd, c3d, d, d
  movd        [dstq], m1
  pshufw      m1, m1, q1111                     ; d, d, d, d
  movd        [dstq+strideq], m1
  RESTORE_GOT
  RET

INIT_XMM ssse3
cglobal d207_predictor_8x8, 4, 5, 4, dst, stride, stride3, left, goffset
  GET_GOT     goffsetq
  movq        m3, [leftq]                       ; abcdefgh [byte]
  lea         stride3q, [strideq*3]

  pshufb      m1, m3, [GLOBAL(sh_b2345677777777777)]
  pshufb      m0, m3, [GLOBAL(sh_b0123456777777777)]
  pshufb      m2, m3, [GLOBAL(sh_b1234567777777777)]

  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m2, m1, m3
  pavgb       m0, m2
  punpcklbw   m0, m3                            ; interleaved output

  movq        [dstq], m0
  psrldq      m0, 2
  movq        [dstq+strideq], m0
  psrldq      m0, 2
  movq        [dstq+strideq*2], m0
  psrldq      m0, 2
  movq        [dstq+stride3q], m0
  lea         dstq, [dstq+strideq*4]
  pshufhw     m0, m0, q0000                     ; de, d2ef, ef, e2fg, fg, f2gh, gh, g3h, 8xh
  psrldq      m0, 2
  movq        [dstq], m0
  psrldq      m0, 2
  movq        [dstq+strideq], m0
  psrldq      m0, 2
  movq        [dstq+strideq*2], m0
  psrldq      m0, 2
  movq        [dstq+stride3q], m0
  RESTORE_GOT
  RET

INIT_XMM ssse3
cglobal d207_predictor_16x16, 4, 5, 5, dst, stride, stride3, left, goffset
  GET_GOT     goffsetq
  lea         stride3q, [strideq*3]
  mova        m0, [leftq]                       ; abcdefghijklmnop [byte]
  pshufb      m1, m0, [GLOBAL(sh_b123456789abcdeff)] ; bcdefghijklmnopp
  pshufb      m2, m0, [GLOBAL(sh_b23456789abcdefff)]

  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3
  pavgb       m1, m0                            ; ab, bc, cd .. no, op, pp [byte]

  punpckhbw   m4, m1, m3                        ; interleaved input
  punpcklbw   m1, m3                            ; interleaved output
  ; rows 0-7: window sliding over m4:m1, two bytes per row
  mova        [dstq], m1
  palignr     m3, m4, m1, 2
  mova        [dstq+strideq], m3
  palignr     m3, m4, m1, 4
  mova        [dstq+strideq*2], m3
  palignr     m3, m4, m1, 6
  mova        [dstq+stride3q], m3
  lea         dstq, [dstq+strideq*4]
  palignr     m3, m4, m1, 8
  mova        [dstq], m3
  palignr     m3, m4, m1, 10
  mova        [dstq+strideq], m3
  palignr     m3, m4, m1, 12
  mova        [dstq+strideq*2], m3
  palignr     m3, m4, m1, 14
  mova        [dstq+stride3q], m3
  DEFINE_ARGS dst, stride, stride3, line
  mov         lined, 2
  ; m0 advances a row by two bytes, repeating the last byte
  mova        m0, [GLOBAL(sh_b23456789abcdefff)]
.loop:
  lea         dstq, [dstq+strideq*4]
  mova        [dstq], m4
  pshufb      m4, m0
  mova        [dstq+strideq], m4
  pshufb      m4, m0
  mova        [dstq+strideq*2], m4
  pshufb      m4, m0
  mova        [dstq+stride3q], m4
  pshufb      m4, m0
  dec         lined
  jnz .loop
  RESTORE_GOT
  REP_RET

INIT_XMM ssse3
cglobal d207_predictor_32x32, 4, 5, 8, dst, stride, stride3, left, goffset
  GET_GOT     goffsetq
  lea         stride3q, [strideq*3]
  mova        m1, [leftq]                       ; 0-15 [byte]
  mova        m2, [leftq+16]                    ; 16-31 [byte]
  pshufb      m0, m2, [GLOBAL(sh_b23456789abcdefff)]
  pshufb      m4, m2, [GLOBAL(sh_b123456789abcdeff)]

  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m2, m4, m0, m3
  palignr     m6, m2, m1, 1
  palignr     m5, m2, m1, 2
  pavgb       m2, m4                            ; high 16px even lines

  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m6, m5, m0
  pavgb       m1, m6                            ; low 16px even lines

  punpckhbw   m6, m1, m0                        ; interleaved output 2
  punpcklbw   m1, m0                            ; interleaved output 1

  punpckhbw   m7, m2, m3                        ; interleaved output 4
  punpcklbw   m2, m3                            ; interleaved output 3

  ; output 1st 8 lines (and half of 2nd 8 lines)
  ; left is fully loaded; its register becomes dst8 (dst + 8 rows)
  DEFINE_ARGS dst, stride, stride3, dst8
  lea         dst8q, [dstq+strideq*8]
  mova        [dstq], m1
  mova        [dstq+16], m6
  mova        [dst8q], m6
  palignr     m0, m6, m1, 2
  palignr     m4, m2, m6, 2
  mova        [dstq+strideq], m0
  mova        [dstq+strideq+16], m4
  mova        [dst8q+strideq], m4
  palignr     m0, m6, m1, 4
  palignr     m4, m2, m6, 4
  mova        [dstq+strideq*2], m0
  mova        [dstq+strideq*2+16], m4
  mova        [dst8q+strideq*2], m4
  palignr     m0, m6, m1, 6
  palignr     m4, m2, m6, 6
  mova        [dstq+stride3q], m0
  mova        [dstq+stride3q+16], m4
  mova        [dst8q+stride3q], m4
  lea         dstq, [dstq+strideq*4]
  lea         dst8q, [dst8q+strideq*4]
  palignr     m0, m6, m1, 8
  palignr     m4, m2, m6, 8
  mova        [dstq], m0
  mova        [dstq+16], m4
  mova        [dst8q], m4
  palignr     m0, m6, m1, 10
  palignr     m4, m2, m6, 10
  mova        [dstq+strideq], m0
  mova        [dstq+strideq+16], m4
  mova        [dst8q+strideq], m4
  palignr     m0, m6, m1, 12
  palignr     m4, m2, m6, 12
  mova        [dstq+strideq*2], m0
  mova        [dstq+strideq*2+16], m4
  mova        [dst8q+strideq*2], m4
  palignr     m0, m6, m1, 14
  palignr     m4, m2, m6, 14
  mova        [dstq+stride3q], m0
  mova        [dstq+stride3q+16], m4
  mova        [dst8q+stride3q], m4
  lea         dstq, [dstq+strideq*4]
  lea         dst8q, [dst8q+strideq*4]

  ; output 2nd half of 2nd 8 lines and half of 3rd 8 lines
  mova        [dstq+16], m2
  mova        [dst8q], m2
  palignr     m4, m7, m2, 2
  mova        [dstq+strideq+16], m4
  mova        [dst8q+strideq], m4
  palignr     m4, m7, m2, 4
  mova        [dstq+strideq*2+16], m4
  mova        [dst8q+strideq*2], m4
  palignr     m4, m7, m2, 6
  mova        [dstq+stride3q+16], m4
  mova        [dst8q+stride3q], m4
  lea         dstq, [dstq+strideq*4]
  lea         dst8q, [dst8q+strideq*4]
  palignr     m4, m7, m2, 8
  mova        [dstq+16], m4
  mova        [dst8q], m4
  palignr     m4, m7, m2, 10
  mova        [dstq+strideq+16], m4
  mova        [dst8q+strideq], m4
  palignr     m4, m7, m2, 12
  mova        [dstq+strideq*2+16], m4
  mova        [dst8q+strideq*2], m4
  palignr     m4, m7, m2, 14
  mova        [dstq+stride3q+16], m4
  mova        [dst8q+stride3q], m4
  lea         dstq, [dstq+strideq*4]
  lea         dst8q, [dst8q+strideq*4]

  ; output 2nd half of 3rd 8 lines and half of 4th 8 lines
  mova        m0, [GLOBAL(sh_b23456789abcdefff)]
  mova        [dstq+16], m7
  mova        [dst8q], m7
  pshufb      m7, m0
  mova        [dstq+strideq+16], m7
  mova        [dst8q+strideq], m7
  pshufb      m7, m0
  mova        [dstq+strideq*2+16], m7
  mova        [dst8q+strideq*2], m7
  pshufb      m7, m0
  mova        [dstq+stride3q+16], m7
  mova        [dst8q+stride3q], m7
  pshufb      m7, m0
  lea         dstq, [dstq+strideq*4]
  lea         dst8q, [dst8q+strideq*4]
  mova        [dstq+16], m7
  mova        [dst8q], m7
  pshufb      m7, m0
  mova        [dstq+strideq+16], m7
  mova        [dst8q+strideq], m7
  pshufb      m7, m0
  mova        [dstq+strideq*2+16], m7
  mova        [dst8q+strideq*2], m7
  pshufb      m7, m0
  mova        [dstq+stride3q+16], m7
  mova        [dst8q+stride3q], m7
  pshufb      m7, m0
  lea         dstq, [dstq+strideq*4]

  ; output last half of 4th 8 lines
  mova        [dstq+16], m7
  mova        [dstq+strideq+16], m7
  mova        [dstq+strideq*2+16], m7
  mova        [dstq+stride3q+16], m7
  lea         dstq, [dstq+strideq*4]
  mova        [dstq+16], m7
  mova        [dstq+strideq+16], m7
  mova        [dstq+strideq*2+16], m7
  mova        [dstq+stride3q+16], m7

  ; done!
  RESTORE_GOT
  RET