media/libvpx/vp9/common/x86/vp9_intrapred_ssse3.asm

Thu, 15 Jan 2015 15:59:08 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 15 Jan 2015 15:59:08 +0100
branch
TOR_BUG_9701
changeset 10
ac0c01689b40
permissions
-rw-r--r--

Implement a real Private Browsing Mode condition by changing the API/ABI.
This solves Tor bug #9701 and complies with the disk-avoidance requirements documented at
https://www.torproject.org/projects/torbrowser/design/#disk-avoidance.

michael@0 1 ;
michael@0 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
michael@0 3 ;
michael@0 4 ; Use of this source code is governed by a BSD-style license
michael@0 5 ; that can be found in the LICENSE file in the root of the source
michael@0 6 ; tree. An additional intellectual property rights grant can be found
michael@0 7 ; in the file PATENTS. All contributing project authors may
michael@0 8 ; be found in the AUTHORS file in the root of the source tree.
michael@0 9 ;
michael@0 10
michael@0 11 %include "third_party/x86inc/x86inc.asm"
michael@0 12
; Read-only constants shared by the predictors in this file.
;   pb_1   : sixteen bytes of 1; ANDed with the XOR term of the 3-tap average
;            to isolate the rounding bit.
;   sh_b*  : 16-byte pshufb control masks. The name spells, in hex, the source
;            byte index chosen for each output byte. The short names only
;            describe the low bytes; the remaining control bytes are 0.
michael@0 13 SECTION_RODATA
michael@0 14
michael@0 15 pb_1: times 16 db 1
michael@0 16 sh_b01234577: db 0, 1, 2, 3, 4, 5, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0
michael@0 17 sh_b12345677: db 1, 2, 3, 4, 5, 6, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0
michael@0 18 sh_b23456777: db 2, 3, 4, 5, 6, 7, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0
; 8-byte input widened to 16 bytes with byte 7 replicated.
michael@0 19 sh_b0123456777777777: db 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7
michael@0 20 sh_b1234567777777777: db 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7
michael@0 21 sh_b2345677777777777: db 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7
; Shift a 16-byte vector down by 1 / 2 bytes, replicating byte 15 at the top.
michael@0 22 sh_b123456789abcdeff: db 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15
michael@0 23 sh_b23456789abcdefff: db 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15
michael@0 24 sh_b32104567: db 3, 2, 1, 0, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0
michael@0 25 sh_b8091a2b345: db 8, 0, 9, 1, 10, 2, 11, 3, 4, 5, 0, 0, 0, 0, 0, 0
michael@0 26 sh_b76543210: db 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0
michael@0 27 sh_b65432108: db 6, 5, 4, 3, 2, 1, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0
michael@0 28 sh_b54321089: db 5, 4, 3, 2, 1, 0, 8, 9, 0, 0, 0, 0, 0, 0, 0, 0
michael@0 29 sh_b89abcdef: db 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0
; Full 16-byte reversal.
michael@0 30 sh_bfedcba9876543210: db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
michael@0 31 sh_b1233: db 1, 2, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
michael@0 32 sh_b2333: db 2, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
michael@0 33
michael@0 34 SECTION .text
michael@0 35
; h_predictor_4x4: fill each of the 4 output rows with one byte taken from
; the `left` array (row r gets left[r] in all 4 columns).
; dst and stride are loaded by cglobal; the third register is used only as a
; negative row-pair counter; left is fetched explicitly with movifnidn.
INIT_MMX ssse3
cglobal h_predictor_4x4, 2, 4, 3, dst, stride, line, left
  movifnidn          leftq, leftmp
  pxor                  m0, m0                  ; zero pshufb mask => splat byte 0
  add                leftq, 4                   ; point one past the last left byte
  mov                lineq, -2                  ; two rows per pass, counts up to 0
.loop:
  movd                  m1, [leftq+lineq*2  ]
  pshufb                m1, m0                  ; even row value in every lane
  movd                  m2, [leftq+lineq*2+1]
  pshufb                m2, m0                  ; odd row value in every lane
  movd      [dstq        ], m1
  movd      [dstq+strideq], m2
  lea                 dstq, [dstq+strideq*2]
  add                lineq, 1
  jnz .loop
  REP_RET
michael@0 53
; h_predictor_8x8: fill each of the 8 output rows with one byte taken from
; the `left` array (row r gets left[r] in all 8 columns).
; dst and stride are loaded by cglobal; the third register is used only as a
; negative row-pair counter; left is fetched explicitly with movifnidn.
INIT_MMX ssse3
cglobal h_predictor_8x8, 2, 4, 3, dst, stride, line, left
  movifnidn          leftq, leftmp
  pxor                  m0, m0                  ; zero pshufb mask => splat byte 0
  add                leftq, 8                   ; point one past the last left byte
  mov                lineq, -4                  ; two rows per pass, counts up to 0
.loop:
  movd                  m1, [leftq+lineq*2  ]
  pshufb                m1, m0                  ; even row value in every lane
  movd                  m2, [leftq+lineq*2+1]
  pshufb                m2, m0                  ; odd row value in every lane
  movq      [dstq        ], m1
  movq      [dstq+strideq], m2
  lea                 dstq, [dstq+strideq*2]
  add                lineq, 1
  jnz .loop
  REP_RET
michael@0 71
; h_predictor_16x16: fill each of the 16 output rows with one byte taken from
; the `left` array (row r gets left[r] in all 16 columns).
; Rows are written with mova, so each dst row must satisfy that instruction's
; alignment requirement.
INIT_XMM ssse3
cglobal h_predictor_16x16, 2, 4, 3, dst, stride, line, left
  movifnidn          leftq, leftmp
  pxor                  m0, m0                  ; zero pshufb mask => splat byte 0
  add                leftq, 16                  ; point one past the last left byte
  mov                lineq, -8                  ; two rows per pass, counts up to 0
.loop:
  movd                  m1, [leftq+lineq*2  ]
  pshufb                m1, m0                  ; even row value in every lane
  movd                  m2, [leftq+lineq*2+1]
  pshufb                m2, m0                  ; odd row value in every lane
  mova      [dstq        ], m1
  mova      [dstq+strideq], m2
  lea                 dstq, [dstq+strideq*2]
  add                lineq, 1
  jnz .loop
  REP_RET
michael@0 89
; h_predictor_32x32: fill each of the 32 output rows with one byte taken from
; the `left` array (row r gets left[r] in all 32 columns).
; Each row is written as two 16-byte mova stores.
INIT_XMM ssse3
cglobal h_predictor_32x32, 2, 4, 3, dst, stride, line, left
  movifnidn          leftq, leftmp
  pxor                  m0, m0                  ; zero pshufb mask => splat byte 0
  add                leftq, 32                  ; point one past the last left byte
  mov                lineq, -16                 ; two rows per pass, counts up to 0
.loop:
  movd                  m1, [leftq+lineq*2  ]
  pshufb                m1, m0                  ; even row value in every lane
  movd                  m2, [leftq+lineq*2+1]
  pshufb                m2, m0                  ; odd row value in every lane
  mova   [dstq           ], m1
  mova   [dstq        +16], m1
  mova   [dstq+strideq   ], m2
  mova   [dstq+strideq+16], m2
  lea                 dstq, [dstq+strideq*2]
  add                lineq, 1
  jnz .loop
  REP_RET
michael@0 109
; d45_predictor_4x4: read 8 bytes from `above`, run the 3-tap filter
; (x + 2y + z + 2) >> 2 over them and write 4 rows of 4 bytes, each row being
; the filtered vector advanced by one more byte.
;   x = above shuffled by sh_b01234577, y = by sh_b12345677, z = by sh_b23456777
INIT_MMX ssse3
cglobal d45_predictor_4x4, 3, 4, 4, dst, stride, above, goffset
  GET_GOT     goffsetq

  movq                m0, [aboveq]
  pshufb              m1, m0, [GLOBAL(sh_b01234577)]    ; x
  pshufb              m2, m0, [GLOBAL(sh_b23456777)]    ; z
  pshufb              m0, [GLOBAL(sh_b12345677)]        ; y
  pavgb               m3, m1, m2                        ; (x + z + 1) >> 1
  pxor                m1, m2
  pand                m1, [GLOBAL(pb_1)]                ; (x ^ z) & 1
  psubb               m3, m1                            ; (x + z) >> 1
  pavgb               m3, m0                            ; (x + 2y + z + 2) >> 2

  ; emit the four rows, dropping one leading byte per row
  movd    [dstq        ], m3
  psrlq               m3, 8
  movd    [dstq+strideq], m3
  psrlq               m3, 8
  lea               dstq, [dstq+strideq*2]
  movd    [dstq        ], m3
  psrlq               m3, 8
  movd    [dstq+strideq], m3

  RESTORE_GOT
  RET
michael@0 136
; d45_predictor_8x8: read 8 bytes from `above`, run the 3-tap filter
; (x + 2y + z + 2) >> 2 over them (bytes past index 7 are replaced by byte 7)
; and write 8 rows of 8 bytes. Each following row is the previous one passed
; through sh_b12345677, i.e. advanced one byte with byte 7 held.
; The `above` register is recycled as stride3 once the input has been loaded.
INIT_MMX ssse3
cglobal d45_predictor_8x8, 3, 4, 4, dst, stride, above, goffset
  GET_GOT     goffsetq

  movq                m0, [aboveq]                      ; x
  DEFINE_ARGS dst, stride, stride3
  mova                m1, [GLOBAL(sh_b12345677)]        ; row-advance mask
  pshufb              m2, m0, [GLOBAL(sh_b23456777)]    ; z
  lea           stride3q, [strideq*3]
  pavgb               m3, m0, m2                        ; (x + z + 1) >> 1
  pxor                m2, m0
  pand                m2, [GLOBAL(pb_1)]                ; (x ^ z) & 1
  psubb               m3, m2                            ; (x + z) >> 1
  pshufb              m0, m1                            ; y
  pavgb               m3, m0                            ; (x + 2y + z + 2) >> 2

  ; rows 0-3
  movq  [dstq          ], m3
  pshufb              m3, m1
  movq  [dstq+strideq  ], m3
  pshufb              m3, m1
  movq  [dstq+strideq*2], m3
  pshufb              m3, m1
  movq  [dstq+stride3q ], m3
  pshufb              m3, m1
  lea               dstq, [dstq+strideq*4]

  ; rows 4-7
  movq  [dstq          ], m3
  pshufb              m3, m1
  movq  [dstq+strideq  ], m3
  pshufb              m3, m1
  movq  [dstq+strideq*2], m3
  pshufb              m3, m1
  movq  [dstq+stride3q ], m3

  RESTORE_GOT
  RET
michael@0 175
; d45_predictor_16x16: load 16 bytes from `above`, apply the 3-tap filter
; (x + 2y + z + 2) >> 2 (neighbours past byte 15 are replaced by byte 15 via
; the shuffle masks) and write 16 rows of 16 bytes. Every row is the previous
; row passed through sh_b123456789abcdeff (advance one byte, hold byte 15).
; dst8 tracks the row 8 lines below dst so that the high half of row r can be
; stored as the low half of row r+8 in the same pass.
michael@0 176 INIT_XMM ssse3
michael@0 177 cglobal d45_predictor_16x16, 3, 6, 4, dst, stride, above, dst8, line, goffset
michael@0 178 GET_GOT goffsetq
michael@0 179
michael@0 180 mova m0, [aboveq]
; `above` has been consumed; its register is renamed stride3 from here on.
michael@0 181 DEFINE_ARGS dst, stride, stride3, dst8, line
michael@0 182 lea stride3q, [strideq*3]
michael@0 183 lea dst8q, [dstq+strideq*8]
michael@0 184 mova m1, [GLOBAL(sh_b123456789abcdeff)]
; Same pavgb/pxor/pand/psubb/pavgb sequence as X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2,
; with x = m0, z = m2 and y = m0 shuffled by m1; the result ends up in m0.
michael@0 185 pshufb m2, m0, [GLOBAL(sh_b23456789abcdefff)]
michael@0 186 pavgb m3, m2, m0
michael@0 187 pxor m2, m0
michael@0 188 pshufb m0, m1
michael@0 189 pand m2, [GLOBAL(pb_1)]
michael@0 190 psubb m3, m2
michael@0 191 pavgb m0, m3
michael@0 192
michael@0 193 ; first 4 lines and first half of 3rd 4 lines
michael@0 194 mov lined, 2 ; 2 passes x 4 rows = rows 0-7 (and left half of rows 8-15)
michael@0 195 .loop:
michael@0 196 mova [dstq ], m0
michael@0 197 movhps [dst8q ], m0
michael@0 198 pshufb m0, m1
michael@0 199 mova [dstq +strideq ], m0
michael@0 200 movhps [dst8q+strideq ], m0
michael@0 201 pshufb m0, m1
michael@0 202 mova [dstq +strideq*2 ], m0
michael@0 203 movhps [dst8q+strideq*2 ], m0
michael@0 204 pshufb m0, m1
michael@0 205 mova [dstq +stride3q ], m0
michael@0 206 movhps [dst8q+stride3q ], m0
michael@0 207 pshufb m0, m1
michael@0 208 lea dstq, [dstq +strideq*4]
michael@0 209 lea dst8q, [dst8q+strideq*4]
michael@0 210 dec lined
michael@0 211 jnz .loop
michael@0 212
; After 8 shuffles the high 8 bytes of m0 all hold the original byte 15, and
; dstq now points at row 8; the same 8 bytes fill columns 8-15 of rows 8-15.
michael@0 213 ; bottom-right 8x8 block
michael@0 214 movhps [dstq +8], m0
michael@0 215 movhps [dstq+strideq +8], m0
michael@0 216 movhps [dstq+strideq*2+8], m0
michael@0 217 movhps [dstq+stride3q +8], m0
michael@0 218 lea dstq, [dstq+strideq*4]
michael@0 219 movhps [dstq +8], m0
michael@0 220 movhps [dstq+strideq +8], m0
michael@0 221 movhps [dstq+strideq*2+8], m0
michael@0 222 movhps [dstq+stride3q +8], m0
michael@0 223
michael@0 224 RESTORE_GOT
michael@0 225 RET
michael@0 226
; d45_predictor_32x32: load 32 bytes from `above` (m0 = bytes 0-15,
; m4 = bytes 16-31), apply the 3-tap filter (x + 2y + z + 2) >> 2 to both
; halves and write 32 rows of 32 bytes. Each row is the previous row advanced
; by one byte; the high half holds byte 31 at the top via sh_b123456789abcdeff.
; dst16 tracks the row 16 lines below dst: the high 16 bytes of row r are also
; the low 16 bytes of row r+16.
michael@0 227 INIT_XMM ssse3
michael@0 228 cglobal d45_predictor_32x32, 3, 6, 7, dst, stride, above, dst16, line, goffset
michael@0 229 GET_GOT goffsetq
michael@0 230
michael@0 231 mova m0, [aboveq]
michael@0 232 mova m4, [aboveq+16]
; `above` has been consumed; its register is renamed stride3 from here on.
michael@0 233 DEFINE_ARGS dst, stride, stride3, dst16, line
michael@0 234 lea stride3q, [strideq*3]
michael@0 235 lea dst16q, [dstq +strideq*8]
michael@0 236 lea dst16q, [dst16q+strideq*8] ; dst16 = dst + 16 rows
michael@0 237 mova m1, [GLOBAL(sh_b123456789abcdeff)]
; High half: z = m2, x = m4, y = m4 shuffled by m1; filtered result in m4.
; m5/m6 (low half advanced by 1 and 2 bytes, pulling bytes in from m4) are
; captured before m4 is overwritten.
michael@0 238 pshufb m2, m4, [GLOBAL(sh_b23456789abcdefff)]
michael@0 239 pavgb m3, m2, m4
michael@0 240 pxor m2, m4
michael@0 241 palignr m5, m4, m0, 1
michael@0 242 palignr m6, m4, m0, 2
michael@0 243 pshufb m4, m1
michael@0 244 pand m2, [GLOBAL(pb_1)]
michael@0 245 psubb m3, m2
michael@0 246 pavgb m4, m3
; Low half: x = m0, y = m5, z = m6; filtered result in m5.
michael@0 247 pavgb m3, m0, m6
michael@0 248 pxor m0, m6
michael@0 249 pand m0, [GLOBAL(pb_1)]
michael@0 250 psubb m3, m0
michael@0 251 pavgb m5, m3
michael@0 252
michael@0 253 ; write 4x4 lines (and the first half of the second 4x4 lines)
michael@0 254 mov lined, 4 ; 4 passes x 4 rows = rows 0-15 (and left half of rows 16-31)
michael@0 255 .loop:
michael@0 256 mova [dstq ], m5
michael@0 257 mova [dstq +16], m4
michael@0 258 mova [dst16q ], m4
michael@0 259 palignr m3, m4, m5, 1
michael@0 260 pshufb m4, m1
michael@0 261 mova [dstq +strideq ], m3
michael@0 262 mova [dstq +strideq +16], m4
michael@0 263 mova [dst16q+strideq ], m4
michael@0 264 palignr m5, m4, m3, 1
michael@0 265 pshufb m4, m1
michael@0 266 mova [dstq +strideq*2 ], m5
michael@0 267 mova [dstq +strideq*2+16], m4
michael@0 268 mova [dst16q+strideq*2 ], m4
michael@0 269 palignr m3, m4, m5, 1
michael@0 270 pshufb m4, m1
michael@0 271 mova [dstq +stride3q ], m3
michael@0 272 mova [dstq +stride3q +16], m4
michael@0 273 mova [dst16q+stride3q ], m4
michael@0 274 palignr m5, m4, m3, 1
michael@0 275 pshufb m4, m1
michael@0 276 lea dstq, [dstq +strideq*4]
michael@0 277 lea dst16q, [dst16q+strideq*4]
michael@0 278 dec lined
michael@0 279 jnz .loop
michael@0 280
; m4 has now been advanced 16 times, so every byte holds its original top
; byte; dstq points at row 16. Fill columns 16-31 of rows 16-31 with it.
michael@0 281 ; write second half of second 4x4 lines
michael@0 282 mova [dstq +16], m4
michael@0 283 mova [dstq +strideq +16], m4
michael@0 284 mova [dstq +strideq*2+16], m4
michael@0 285 mova [dstq +stride3q +16], m4
michael@0 286 lea dstq, [dstq +strideq*4]
michael@0 287 mova [dstq +16], m4
michael@0 288 mova [dstq +strideq +16], m4
michael@0 289 mova [dstq +strideq*2+16], m4
michael@0 290 mova [dstq +stride3q +16], m4
michael@0 291 lea dstq, [dstq +strideq*4]
michael@0 292 mova [dstq +16], m4
michael@0 293 mova [dstq +strideq +16], m4
michael@0 294 mova [dstq +strideq*2+16], m4
michael@0 295 mova [dstq +stride3q +16], m4
michael@0 296 lea dstq, [dstq +strideq*4]
michael@0 297 mova [dstq +16], m4
michael@0 298 mova [dstq +strideq +16], m4
michael@0 299 mova [dstq +strideq*2+16], m4
michael@0 300 mova [dstq +stride3q +16], m4
michael@0 301
michael@0 302 RESTORE_GOT
michael@0 303 RET
michael@0 304
michael@0 305 ; ------------------------------------------
michael@0 306 ; input: x, y, z, result
michael@0 307 ;
michael@0 308 ; trick from pascal
michael@0 309 ; (x+2y+z+2)>>2 can be calculated as:
michael@0 310 ; result = avg(x,z)
michael@0 311 ; result -= xor(x,z) & 1
michael@0 312 ; result = avg(result,y)
michael@0 313 ; ------------------------------------------
; Why it works: pavgb rounds up, i.e. avg(a,b) = (a+b+1)>>1. Subtracting
; (x^z)&1 turns the first average into the truncating (x+z)>>1, and the
; second rounding average with y then yields (x+2y+z+2)>>2 per byte.
;
; Operands: %1 = x, %2 = y, %3 = z, %4 = result.
; Side effects: %3 is overwritten (it ends up holding (x^z)&1); %1 and %2
; are only read. The macro references GLOBAL(pb_1), so it must be used
; where GLOBAL is usable (the callers in this file invoke GET_GOT first).
michael@0 314 %macro X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 4
michael@0 315 pavgb %4, %1, %3
michael@0 316 pxor %3, %1
michael@0 317 pand %3, [GLOBAL(pb_1)]
michael@0 318 psubb %4, %3
michael@0 319 pavgb %4, %2
michael@0 320 %endmacro
michael@0 321
; d63_predictor_4x4: from 8 bytes of `above` build two vectors,
;   even = avg(above[i], above[i+1])                 (2-tap)
;   odd  = (above[i] + 2*above[i+1] + above[i+2] + 2) >> 2   (3-tap)
; Rows 0 and 1 are the low 4 bytes of even/odd; rows 2 and 3 are the same
; vectors advanced by one byte.
INIT_XMM ssse3
cglobal d63_predictor_4x4, 3, 4, 5, dst, stride, above, goffset
  GET_GOT     goffsetq

  movq                m0, [aboveq]                      ; x
  pshufb              m1, m0, [GLOBAL(sh_b12345677)]    ; y
  pshufb              m2, m0, [GLOBAL(sh_b23456777)]    ; z

  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3          ; m3 = odd rows (3-tap)
  pavgb               m0, m1                            ; m0 = even rows (2-tap)

  ; rows 0-1, then advance one byte for rows 2-3
  movd    [dstq        ], m0
  movd    [dstq+strideq], m3
  psrldq              m0, 1
  psrldq              m3, 1
  lea               dstq, [dstq+strideq*2]
  movd    [dstq        ], m0
  movd    [dstq+strideq], m3
  RESTORE_GOT
  RET
michael@0 343
; d63_predictor_8x8: from 8 bytes of `above` (widened to 16 bytes with byte 7
; replicated) build two vectors,
;   even = avg(above[i], above[i+1])                 (2-tap)
;   odd  = (above[i] + 2*above[i+1] + above[i+2] + 2) >> 2   (3-tap)
; Rows 2k and 2k+1 are the low 8 bytes of even/odd advanced by k bytes.
; The `above` register is recycled as stride3 once the input has been loaded.
INIT_XMM ssse3
cglobal d63_predictor_8x8, 3, 4, 5, dst, stride, above, goffset
  GET_GOT     goffsetq

  movq                m3, [aboveq]
  DEFINE_ARGS dst, stride, stride3
  lea           stride3q, [strideq*3]
  pshufb              m1, m3, [GLOBAL(sh_b2345677777777777)]    ; z
  pshufb              m0, m3, [GLOBAL(sh_b0123456777777777)]    ; x
  pshufb              m2, m3, [GLOBAL(sh_b1234567777777777)]    ; y

  ; The macro only reads its first operand, so m0 still holds x afterwards
  ; and can be averaged in place; no second copy of x is needed.
  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m2, m1, m4          ; m4 = odd rows (3-tap)
  pavgb               m0, m2                            ; m0 = even rows (2-tap)

  ; store 4 lines
  movq  [dstq          ], m0
  movq  [dstq+strideq  ], m4
  psrldq              m0, 1
  psrldq              m4, 1
  movq  [dstq+strideq*2], m0
  movq  [dstq+stride3q ], m4
  lea               dstq, [dstq+strideq*4]
  psrldq              m0, 1
  psrldq              m4, 1

  ; store 4 lines
  movq  [dstq          ], m0
  movq  [dstq+strideq  ], m4
  psrldq              m0, 1
  psrldq              m4, 1
  movq  [dstq+strideq*2], m0
  movq  [dstq+stride3q ], m4
  RESTORE_GOT
  RET
michael@0 379
; d63_predictor_16x16: from 16 bytes of `above` build two vectors,
;   m0 = avg(above[i], above[i+1])                          (2-tap, even rows)
;   m4 = (above[i] + 2*above[i+1] + above[i+2] + 2) >> 2    (3-tap, odd rows)
; with neighbours past byte 15 replaced by byte 15. Each pair of rows is the
; previous pair passed through sh_b123456789abcdeff (advance one byte).
michael@0 380 INIT_XMM ssse3
michael@0 381 cglobal d63_predictor_16x16, 3, 5, 5, dst, stride, above, line, goffset
michael@0 382 GET_GOT goffsetq
michael@0 383
michael@0 384 mova m0, [aboveq]
; `above` has been consumed; its register is renamed stride3 from here on.
michael@0 385 DEFINE_ARGS dst, stride, stride3, line
michael@0 386 lea stride3q, [strideq*3]
michael@0 387 mova m1, [GLOBAL(sh_b123456789abcdeff)]
michael@0 388 pshufb m2, m0, [GLOBAL(sh_b23456789abcdefff)] ; z
michael@0 389 pshufb m3, m0, m1 ; y
michael@0 390
michael@0 391 X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m3, m2, m4 ; m4 = 3-tap (odd rows)
michael@0 392 pavgb m0, m3 ; m0 = 2-tap (even rows)
michael@0 393
michael@0 394 mov lined, 4 ; 4 passes x 4 rows
michael@0 395 .loop:
michael@0 396 mova [dstq ], m0
michael@0 397 mova [dstq+strideq ], m4
michael@0 398 pshufb m0, m1
michael@0 399 pshufb m4, m1
michael@0 400 mova [dstq+strideq*2], m0
michael@0 401 mova [dstq+stride3q ], m4
michael@0 402 pshufb m0, m1
michael@0 403 pshufb m4, m1
michael@0 404 lea dstq, [dstq+strideq*4]
michael@0 405 dec lined
michael@0 406 jnz .loop
michael@0 407 RESTORE_GOT
michael@0 408 REP_RET
michael@0 409
; d63_predictor_32x32: 32-byte version of the d63 predictor. `above` is
; loaded as m0 (bytes 0-15) and m7 (bytes 16-31). Four vectors are built:
;   m0 / m7 = 2-tap average, low / high half  (even rows)
;   m2 / m4 = 3-tap average, low / high half  (odd rows)
; Each following pair of rows is the previous pair advanced by one byte:
; the low half pulls its new top byte from the high half (palignr), the high
; half holds its top byte (pshufb with sh_b123456789abcdeff).
michael@0 410 INIT_XMM ssse3
michael@0 411 cglobal d63_predictor_32x32, 3, 5, 8, dst, stride, above, line, goffset
michael@0 412 GET_GOT goffsetq
michael@0 413
michael@0 414 mova m0, [aboveq]
michael@0 415 mova m7, [aboveq+16]
; `above` has been consumed; its register is renamed stride3 from here on.
michael@0 416 DEFINE_ARGS dst, stride, stride3, line
michael@0 417 mova m1, [GLOBAL(sh_b123456789abcdeff)]
michael@0 418 lea stride3q, [strideq*3]
michael@0 419 pshufb m2, m7, [GLOBAL(sh_b23456789abcdefff)]
michael@0 420 pshufb m3, m7, m1
michael@0 421
michael@0 422 X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m2, m4 ; m4 = 3-tap, high half
; m7 is still the raw high half here (the macro only reads its first operand),
; so it can supply the bytes shifted into the low half.
michael@0 423 palignr m6, m7, m0, 1
michael@0 424 palignr m5, m7, m0, 2
michael@0 425 pavgb m7, m3 ; m7 = 2-tap, high half
michael@0 426
michael@0 427 X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m6, m5, m2 ; m2 = 3-tap, low half
michael@0 428 pavgb m0, m6 ; m0 = 2-tap, low half
michael@0 429
michael@0 430 mov lined, 8 ; 8 passes x 4 rows
michael@0 431 .loop:
michael@0 432 mova [dstq ], m0
michael@0 433 mova [dstq +16], m7
michael@0 434 mova [dstq+strideq ], m2
michael@0 435 mova [dstq+strideq +16], m4
michael@0 436 palignr m3, m7, m0, 1
michael@0 437 palignr m5, m4, m2, 1
michael@0 438 pshufb m7, m1
michael@0 439 pshufb m4, m1
michael@0 440
michael@0 441 mova [dstq+strideq*2 ], m3
michael@0 442 mova [dstq+strideq*2+16], m7
michael@0 443 mova [dstq+stride3q ], m5
michael@0 444 mova [dstq+stride3q +16], m4
michael@0 445 palignr m0, m7, m3, 1
michael@0 446 palignr m2, m4, m5, 1
michael@0 447 pshufb m7, m1
michael@0 448 pshufb m4, m1
michael@0 449 lea dstq, [dstq+strideq*4]
michael@0 450 dec lined
michael@0 451 jnz .loop
michael@0 452 RESTORE_GOT
michael@0 453 REP_RET
michael@0 454
; d153_predictor_4x4: builds the 4x4 block from 4 `left` bytes and the 4 bytes
; starting at above[-1] (top-left, then t1-t3). The edge is arranged as one
; vector running from the bottom-left pixel up and then along the top; 2-tap
; (A) and 3-tap (B, C, D) averages over it are interleaved so that each row
; is the row below it advanced by two bytes. Rows are stored bottom-up.
michael@0 455 INIT_XMM ssse3
michael@0 456 cglobal d153_predictor_4x4, 4, 5, 4, dst, stride, above, left, goffset
michael@0 457 GET_GOT goffsetq
michael@0 458 movd m0, [leftq] ; l1, l2, l3, l4
michael@0 459 movd m1, [aboveq-1] ; tl, t1, t2, t3
michael@0 460 punpckldq m0, m1 ; l1, l2, l3, l4, tl, t1, t2, t3
michael@0 461 pshufb m0, [GLOBAL(sh_b32104567)]; l4, l3, l2, l1, tl, t1, t2, t3
michael@0 462 psrldq m1, m0, 1 ; l3, l2, l1, tl, t1, t2, t3
michael@0 463 psrldq m2, m0, 2 ; l2, l1, tl, t1, t2, t3
michael@0 464 ; comments below are for a predictor like this
michael@0 465 ; A1 B1 C1 D1
michael@0 466 ; A2 B2 A1 B1
michael@0 467 ; A3 B3 A2 B2
michael@0 468 ; A4 B4 A3 B3
michael@0 469 X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3 ; 3-tap avg B4 B3 B2 B1 C1 D1
michael@0 470 pavgb m1, m0 ; 2-tap avg A4 A3 A2 A1
michael@0 471
michael@0 472 punpcklqdq m3, m1 ; B4 B3 B2 B1 C1 D1 x x A4 A3 A2 A1 ..
michael@0 473
; above/left are no longer needed; the third register becomes stride3.
michael@0 474 DEFINE_ARGS dst, stride, stride3
michael@0 475 lea stride3q, [strideq*3]
michael@0 476 pshufb m3, [GLOBAL(sh_b8091a2b345)] ; A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 ..
michael@0 477 movd [dstq+stride3q ], m3
michael@0 478 psrldq m3, 2 ; A3 B3 A2 B2 A1 B1 C1 D1 ..
michael@0 479 movd [dstq+strideq*2], m3
michael@0 480 psrldq m3, 2 ; A2 B2 A1 B1 C1 D1 ..
michael@0 481 movd [dstq+strideq ], m3
michael@0 482 psrldq m3, 2 ; A1 B1 C1 D1 ..
michael@0 483 movd [dstq ], m3
michael@0 484 RESTORE_GOT
michael@0 485 RET
michael@0 486
; d153_predictor_8x8: builds the 8x8 block from 8 `left` bytes and the 8 bytes
; starting at above[-1] (top-left, then t1-t7). Column pairs (A, B) come from
; 2-tap / 3-tap averages along the left edge, C-H of the first row from 3-tap
; averages along the top; each row is the row below it advanced by two bytes.
; NOTE(review): the "[word]" tags in the comments below label byte vectors;
; the shuffles and averages here all operate on bytes.
michael@0 487 INIT_XMM ssse3
michael@0 488 cglobal d153_predictor_8x8, 4, 5, 8, dst, stride, above, left, goffset
michael@0 489 GET_GOT goffsetq
michael@0 490 movq m0, [leftq] ; [0- 7] l1-8 [byte]
michael@0 491 movhps m0, [aboveq-1] ; [8-15] tl, t1-7 [byte]
michael@0 492 pshufb m1, m0, [GLOBAL(sh_b76543210)] ; l8-1 [word]
michael@0 493 pshufb m2, m0, [GLOBAL(sh_b65432108)] ; l7-1,tl [word]
michael@0 494 pshufb m3, m0, [GLOBAL(sh_b54321089)] ; l6-1,tl,t1 [word]
michael@0 495 pshufb m0, [GLOBAL(sh_b89abcdef)] ; tl,t1-7 [word]
michael@0 496 psrldq m4, m0, 1 ; t1-7 [word]
michael@0 497 psrldq m5, m0, 2 ; t2-7 [word]
michael@0 498 ; comments below are for a predictor like this
michael@0 499 ; A1 B1 C1 D1 E1 F1 G1 H1
michael@0 500 ; A2 B2 A1 B1 C1 D1 E1 F1
michael@0 501 ; A3 B3 A2 B2 A1 B1 C1 D1
michael@0 502 ; A4 B4 A3 B3 A2 B2 A1 B1
michael@0 503 ; A5 B5 A4 B4 A3 B3 A2 B2
michael@0 504 ; A6 B6 A5 B5 A4 B4 A3 B3
michael@0 505 ; A7 B7 A6 B6 A5 B5 A4 B4
michael@0 506 ; A8 B8 A7 B7 A6 B6 A5 B5
michael@0 507 pavgb m6, m1, m2 ; 2-tap avg A8-A1
michael@0 508
michael@0 509 X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m4, m5, m7 ; 3-tap avg C-H1
michael@0 510
michael@0 511 X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m2, m3, m0 ; 3-tap avg B8-1
michael@0 512
michael@0 513 punpcklbw m6, m0 ; A-B8, A-B7 ... A-B2, A-B1
michael@0 514
; above/left are no longer needed; the third register becomes stride3.
michael@0 515 DEFINE_ARGS dst, stride, stride3
michael@0 516 lea stride3q, [strideq*3]
michael@0 517
; Rows 3..0 first (they need the C-H1 bytes from m7), then rows 7..4 straight
; out of m6.
michael@0 518 movhps [dstq+stride3q], m6 ; A-B4, A-B3, A-B2, A-B1
michael@0 519 palignr m0, m7, m6, 10 ; A-B3, A-B2, A-B1, C-H1
michael@0 520 movq [dstq+strideq*2], m0
michael@0 521 psrldq m0, 2 ; A-B2, A-B1, C-H1
michael@0 522 movq [dstq+strideq ], m0
michael@0 523 psrldq m0, 2 ; A-H1
michael@0 524 movq [dstq ], m0
michael@0 525 lea dstq, [dstq+strideq*4]
michael@0 526 movq [dstq+stride3q ], m6 ; A-B8, A-B7, A-B6, A-B5
michael@0 527 psrldq m6, 2 ; A-B7, A-B6, A-B5, A-B4
michael@0 528 movq [dstq+strideq*2], m6
michael@0 529 psrldq m6, 2 ; A-B6, A-B5, A-B4, A-B3
michael@0 530 movq [dstq+strideq ], m6
michael@0 531 psrldq m6, 2 ; A-B5, A-B4, A-B3, A-B2
michael@0 532 movq [dstq ], m6
michael@0 533 RESTORE_GOT
michael@0 534 RET
michael@0 535
; d153_predictor_16x16: builds the 16x16 block from 16 `left` bytes and the
; 16 bytes starting at above[-1]. Column pairs (A, B) come from 2-tap / 3-tap
; averages along the left edge, C1-P1 of the first row from 3-tap averages
; along the top. The interleaved A/B vectors are byte-reversed so that every
; row can be cut out of a (top : left-low : left-high) register chain with
; palignr, each row being the row below it advanced by two bytes.
INIT_XMM ssse3
cglobal d153_predictor_16x16, 4, 5, 8, dst, stride, above, left, goffset
  GET_GOT     goffsetq
  mova                m0, [leftq]
  movu                m7, [aboveq-1]
  ; comments below are for a predictor like this
  ; A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1 M1 N1 O1 P1
  ; A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1 M1 N1
  ; A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1
  ; A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1
  ; A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1
  ; A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1
  ; A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1
  ; A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1
  ; A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2
  ; Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3
  ; Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4
  ; Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5
  ; Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6
  ; Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7
  ; Af Bf Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8
  ; Ag Bg Af Bf Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9

  ; Reverse the top row so its first two bytes (tl, t1) sit at the high end,
  ; then prepend them to `left`: m5 = left delayed by 1, m3 = left delayed by 2.
  pshufb              m6, m7, [GLOBAL(sh_bfedcba9876543210)]
  palignr             m5, m0, m6, 15
  palignr             m3, m0, m6, 14

  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m5, m3, m4          ; 3-tap avg B3-Bg
  pavgb               m5, m0                            ; A1 - Ag

  punpcklbw           m0, m4, m5                        ; A-B8 ... A-B1
  punpckhbw           m4, m5                            ; A-B9 ... A-Bg

  pshufb              m3, m7, [GLOBAL(sh_b123456789abcdeff)]
  pshufb              m5, m7, [GLOBAL(sh_b23456789abcdefff)]

  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m5, m1          ; 3-tap avg C1-P1

  pshufb              m6, m0, [GLOBAL(sh_bfedcba9876543210)]
  ; above/left are no longer needed; the third register becomes stride3.
  DEFINE_ARGS dst, stride, stride3
  lea           stride3q, [strideq*3]

  ; rows 0-7: windows over (m1 : m6), sliding two bytes per row
  palignr             m2, m1, m6, 14
  mova  [dstq          ], m2
  palignr             m2, m1, m6, 12
  mova  [dstq+strideq  ], m2
  palignr             m2, m1, m6, 10
  mova  [dstq+strideq*2], m2
  palignr             m2, m1, m6, 8
  mova  [dstq+stride3q ], m2
  lea               dstq, [dstq+strideq*4]
  palignr             m2, m1, m6, 6
  mova  [dstq          ], m2
  palignr             m2, m1, m6, 4
  mova  [dstq+strideq  ], m2
  palignr             m2, m1, m6, 2
  mova  [dstq+strideq*2], m2
  pshufb              m4, [GLOBAL(sh_bfedcba9876543210)]
  mova  [dstq+stride3q ], m6
  lea               dstq, [dstq+strideq*4]

  ; rows 8-15: windows over (m6 : m4), sliding two bytes per row
  palignr             m2, m6, m4, 14
  mova  [dstq          ], m2
  palignr             m2, m6, m4, 12
  mova  [dstq+strideq  ], m2
  palignr             m2, m6, m4, 10
  mova  [dstq+strideq*2], m2
  palignr             m2, m6, m4, 8
  mova  [dstq+stride3q ], m2
  lea               dstq, [dstq+strideq*4]
  palignr             m2, m6, m4, 6
  mova  [dstq          ], m2
  palignr             m2, m6, m4, 4
  mova  [dstq+strideq  ], m2
  palignr             m2, m6, m4, 2
  mova  [dstq+strideq*2], m2
  mova  [dstq+stride3q ], m4
  RESTORE_GOT
  RET
michael@0 614
; d153_predictor_32x32: 32x32 version of the d153 predictor. Inputs are the
; 32 `left` bytes and the 32 bytes starting at above[-1].
; Stages:
;   1. 3-tap averages along the top edge -> m1 (low 16), m2 (high 16).
;   2. 2-tap / 3-tap averages along left[0..15], interleaved and
;      byte-reversed -> m6, m4.
;   3. Rows 0-15 are cut out of the register chain m2 : m1 : m6 : m4 with
;      palignr, sliding two bytes per row.
;   4. The same averages are built for left[16..31] -> m3, m2, and rows 16-31
;      are cut out of the chain m6 : m4 : m3 : m2.
; Each row is written as two 16-byte mova stores.
michael@0 615 INIT_XMM ssse3
michael@0 616 cglobal d153_predictor_32x32, 4, 5, 8, dst, stride, above, left, goffset
michael@0 617 GET_GOT goffsetq
michael@0 618 mova m0, [leftq]
michael@0 619 movu m7, [aboveq-1]
michael@0 620 movu m1, [aboveq+15]
michael@0 621
michael@0 622 pshufb m4, m1, [GLOBAL(sh_b123456789abcdeff)]
michael@0 623 pshufb m6, m1, [GLOBAL(sh_b23456789abcdefff)]
michael@0 624
michael@0 625 X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m4, m6, m2 ; 3-tap avg above [high]
michael@0 626
; m1 still holds the raw high half here; it feeds the low-half neighbours
; before being overwritten as the result register of the next macro call.
michael@0 627 palignr m3, m1, m7, 1
michael@0 628 palignr m5, m1, m7, 2
michael@0 629
michael@0 630 X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m5, m1 ; 3-tap avg above [low]
michael@0 631
; Reverse the raw top row so tl/t1 sit at the high end, then prepend them to
; left[0..15]: m5 = left delayed by 1 byte, m3 = left delayed by 2 bytes.
michael@0 632 pshufb m7, [GLOBAL(sh_bfedcba9876543210)]
michael@0 633 palignr m5, m0, m7, 15
michael@0 634 palignr m3, m0, m7, 14
michael@0 635
michael@0 636 X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m5, m3, m4 ; 3-tap avg B3-Bg
michael@0 637 pavgb m5, m0 ; A1 - Ag
michael@0 638 punpcklbw m6, m4, m5 ; A-B8 ... A-B1
michael@0 639 punpckhbw m4, m5 ; A-B9 ... A-Bg
michael@0 640 pshufb m6, [GLOBAL(sh_bfedcba9876543210)]
michael@0 641 pshufb m4, [GLOBAL(sh_bfedcba9876543210)]
michael@0 642
; `left` stays in the argument list because leftq is read again for rows 16-31.
michael@0 643 DEFINE_ARGS dst, stride, stride3, left, line
michael@0 644 lea stride3q, [strideq*3]
michael@0 645
; rows 0-7: low half from (m1 : m6), high half from (m2 : m1)
michael@0 646 palignr m5, m2, m1, 14
michael@0 647 palignr m7, m1, m6, 14
michael@0 648 mova [dstq ], m7
michael@0 649 mova [dstq+16 ], m5
michael@0 650 palignr m5, m2, m1, 12
michael@0 651 palignr m7, m1, m6, 12
michael@0 652 mova [dstq+strideq ], m7
michael@0 653 mova [dstq+strideq+16 ], m5
michael@0 654 palignr m5, m2, m1, 10
michael@0 655 palignr m7, m1, m6, 10
michael@0 656 mova [dstq+strideq*2 ], m7
michael@0 657 mova [dstq+strideq*2+16], m5
michael@0 658 palignr m5, m2, m1, 8
michael@0 659 palignr m7, m1, m6, 8
michael@0 660 mova [dstq+stride3q ], m7
michael@0 661 mova [dstq+stride3q+16 ], m5
michael@0 662 lea dstq, [dstq+strideq*4]
michael@0 663 palignr m5, m2, m1, 6
michael@0 664 palignr m7, m1, m6, 6
michael@0 665 mova [dstq ], m7
michael@0 666 mova [dstq+16 ], m5
michael@0 667 palignr m5, m2, m1, 4
michael@0 668 palignr m7, m1, m6, 4
michael@0 669 mova [dstq+strideq ], m7
michael@0 670 mova [dstq+strideq+16 ], m5
michael@0 671 palignr m5, m2, m1, 2
michael@0 672 palignr m7, m1, m6, 2
michael@0 673 mova [dstq+strideq*2 ], m7
michael@0 674 mova [dstq+strideq*2+16], m5
michael@0 675 mova [dstq+stride3q ], m6
michael@0 676 mova [dstq+stride3q+16 ], m1
michael@0 677 lea dstq, [dstq+strideq*4]
michael@0 678
; rows 8-15: low half from (m6 : m4), high half from (m1 : m6)
michael@0 679 palignr m5, m1, m6, 14
michael@0 680 palignr m3, m6, m4, 14
michael@0 681 mova [dstq ], m3
michael@0 682 mova [dstq+16 ], m5
michael@0 683 palignr m5, m1, m6, 12
michael@0 684 palignr m3, m6, m4, 12
michael@0 685 mova [dstq+strideq ], m3
michael@0 686 mova [dstq+strideq+16 ], m5
michael@0 687 palignr m5, m1, m6, 10
michael@0 688 palignr m3, m6, m4, 10
michael@0 689 mova [dstq+strideq*2 ], m3
michael@0 690 mova [dstq+strideq*2+16], m5
michael@0 691 palignr m5, m1, m6, 8
michael@0 692 palignr m3, m6, m4, 8
michael@0 693 mova [dstq+stride3q ], m3
michael@0 694 mova [dstq+stride3q+16 ], m5
michael@0 695 lea dstq, [dstq+strideq*4]
michael@0 696 palignr m5, m1, m6, 6
michael@0 697 palignr m3, m6, m4, 6
michael@0 698 mova [dstq ], m3
michael@0 699 mova [dstq+16 ], m5
michael@0 700 palignr m5, m1, m6, 4
michael@0 701 palignr m3, m6, m4, 4
michael@0 702 mova [dstq+strideq ], m3
michael@0 703 mova [dstq+strideq+16 ], m5
michael@0 704 palignr m5, m1, m6, 2
michael@0 705 palignr m3, m6, m4, 2
michael@0 706 mova [dstq+strideq*2 ], m3
michael@0 707 mova [dstq+strideq*2+16], m5
michael@0 708 mova [dstq+stride3q ], m4
michael@0 709 mova [dstq+stride3q+16 ], m6
michael@0 710 lea dstq, [dstq+strideq*4]
michael@0 711
; Second half of the left edge: m7 = left[0..15], m3 = left[16..31];
; m5 / m0 = left[16..31] delayed by 1 / 2 bytes (pulling in left[15], left[14]).
michael@0 712 mova m7, [leftq]
michael@0 713 mova m3, [leftq+16]
michael@0 714 palignr m5, m3, m7, 15
michael@0 715 palignr m0, m3, m7, 14
michael@0 716
michael@0 717 X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m3, m5, m0, m2 ; 3-tap avg Bh -
michael@0 718 pavgb m5, m3 ; Ah -
michael@0 719 punpcklbw m3, m2, m5 ; A-B8 ... A-B1
michael@0 720 punpckhbw m2, m5 ; A-B9 ... A-Bg
michael@0 721 pshufb m3, [GLOBAL(sh_bfedcba9876543210)]
michael@0 722 pshufb m2, [GLOBAL(sh_bfedcba9876543210)]
michael@0 723
; rows 16-23: low half from (m4 : m3), high half from (m6 : m4)
michael@0 724 palignr m7, m6, m4, 14
michael@0 725 palignr m0, m4, m3, 14
michael@0 726 mova [dstq ], m0
michael@0 727 mova [dstq+16 ], m7
michael@0 728 palignr m7, m6, m4, 12
michael@0 729 palignr m0, m4, m3, 12
michael@0 730 mova [dstq+strideq ], m0
michael@0 731 mova [dstq+strideq+16 ], m7
michael@0 732 palignr m7, m6, m4, 10
michael@0 733 palignr m0, m4, m3, 10
michael@0 734 mova [dstq+strideq*2 ], m0
michael@0 735 mova [dstq+strideq*2+16], m7
michael@0 736 palignr m7, m6, m4, 8
michael@0 737 palignr m0, m4, m3, 8
michael@0 738 mova [dstq+stride3q ], m0
michael@0 739 mova [dstq+stride3q+16 ], m7
michael@0 740 lea dstq, [dstq+strideq*4]
michael@0 741 palignr m7, m6, m4, 6
michael@0 742 palignr m0, m4, m3, 6
michael@0 743 mova [dstq ], m0
michael@0 744 mova [dstq+16 ], m7
michael@0 745 palignr m7, m6, m4, 4
michael@0 746 palignr m0, m4, m3, 4
michael@0 747 mova [dstq+strideq ], m0
michael@0 748 mova [dstq+strideq+16 ], m7
michael@0 749 palignr m7, m6, m4, 2
michael@0 750 palignr m0, m4, m3, 2
michael@0 751 mova [dstq+strideq*2 ], m0
michael@0 752 mova [dstq+strideq*2+16], m7
michael@0 753 mova [dstq+stride3q ], m3
michael@0 754 mova [dstq+stride3q+16 ], m4
michael@0 755 lea dstq, [dstq+strideq*4]
michael@0 756
; rows 24-31: low half from (m3 : m2), high half from (m4 : m3)
michael@0 757 palignr m7, m4, m3, 14
michael@0 758 palignr m0, m3, m2, 14
michael@0 759 mova [dstq ], m0
michael@0 760 mova [dstq+16 ], m7
michael@0 761 palignr m7, m4, m3, 12
michael@0 762 palignr m0, m3, m2, 12
michael@0 763 mova [dstq+strideq ], m0
michael@0 764 mova [dstq+strideq+16 ], m7
michael@0 765 palignr m7, m4, m3, 10
michael@0 766 palignr m0, m3, m2, 10
michael@0 767 mova [dstq+strideq*2 ], m0
michael@0 768 mova [dstq+strideq*2+16], m7
michael@0 769 palignr m7, m4, m3, 8
michael@0 770 palignr m0, m3, m2, 8
michael@0 771 mova [dstq+stride3q ], m0
michael@0 772 mova [dstq+stride3q+16 ], m7
michael@0 773 lea dstq, [dstq+strideq*4]
michael@0 774 palignr m7, m4, m3, 6
michael@0 775 palignr m0, m3, m2, 6
michael@0 776 mova [dstq ], m0
michael@0 777 mova [dstq+16 ], m7
michael@0 778 palignr m7, m4, m3, 4
michael@0 779 palignr m0, m3, m2, 4
michael@0 780 mova [dstq+strideq ], m0
michael@0 781 mova [dstq+strideq+16 ], m7
michael@0 782 palignr m7, m4, m3, 2
michael@0 783 palignr m0, m3, m2, 2
michael@0 784 mova [dstq+strideq*2 ], m0
michael@0 785 mova [dstq+strideq*2+16], m7
michael@0 786 mova [dstq+stride3q ], m2
michael@0 787 mova [dstq+stride3q+16 ], m3
michael@0 788
michael@0 789 RESTORE_GOT
michael@0 790 RET
michael@0 791
; d207_predictor_4x4: uses only the 4 `left` bytes (a, b, c, d); the third
; argument is declared as `unused`. 2-tap and 3-tap averages of neighbouring
; left pixels (with d replicated past the end) are interleaved byte-wise, and
; each row is the previous row advanced by two bytes. The last row is the
; broadcast of the final (d, d) pair.
michael@0 792 INIT_MMX ssse3
michael@0 793 cglobal d207_predictor_4x4, 4, 5, 4, dst, stride, unused, left, goffset
michael@0 794 GET_GOT goffsetq
michael@0 795 movd m0, [leftq] ; abcd [byte]
michael@0 796 pshufb m1, m0, [GLOBAL(sh_b1233)] ; bcdd [byte]
michael@0 797 pshufb m3, m0, [GLOBAL(sh_b2333)] ; cddd
michael@0 798
michael@0 799 X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m3, m2 ; m2 = 3-tap averages
michael@0 800 pavgb m1, m0 ; ab, bc, cd, d [byte]
michael@0 801
michael@0 802 punpcklbw m1, m2 ; ab, a2bc, bc, b2cd, cd, c3d, d, d
michael@0 803 movd [dstq ], m1
michael@0 804 psrlq m1, 16 ; bc, b2cd, cd, c3d, d, d
michael@0 805 movd [dstq+strideq], m1
michael@0 806 lea dstq, [dstq+strideq*2]
michael@0 807 psrlq m1, 16 ; cd, c3d, d, d
michael@0 808 movd [dstq ], m1
michael@0 809 pshufw m1, m1, q1111 ; d, d, d, d
michael@0 810 movd [dstq+strideq], m1
michael@0 811 RESTORE_GOT
michael@0 812 RET
michael@0 813
; d207_predictor_8x8: uses only the 8 `left` bytes (a-h); the third argument
; register is used as stride3 from the start. 2-tap and 3-tap averages of
; neighbouring left pixels (with h replicated past the end) are interleaved
; byte-wise, and each row is the previous row advanced by two bytes. After
; the first four rows the upper four words are filled with copies of word 4
; so that the remaining rows run out into the replicated last pair.
michael@0 814 INIT_XMM ssse3
michael@0 815 cglobal d207_predictor_8x8, 4, 5, 4, dst, stride, stride3, left, goffset
michael@0 816 GET_GOT goffsetq
michael@0 817 movq m3, [leftq] ; abcdefgh [byte]
michael@0 818 lea stride3q, [strideq*3]
michael@0 819
michael@0 820 pshufb m1, m3, [GLOBAL(sh_b2345677777777777)]
michael@0 821 pshufb m0, m3, [GLOBAL(sh_b0123456777777777)]
michael@0 822 pshufb m2, m3, [GLOBAL(sh_b1234567777777777)]
michael@0 823
; m3 (the raw load) is free to become the 3-tap result; m0 is only read by
; the macro and is then averaged in place to form the 2-tap result.
michael@0 824 X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m2, m1, m3
michael@0 825 pavgb m0, m2
michael@0 826 punpcklbw m0, m3 ; interleaved output
michael@0 827
michael@0 828 movq [dstq ], m0
michael@0 829 psrldq m0, 2
michael@0 830 movq [dstq+strideq ], m0
michael@0 831 psrldq m0, 2
michael@0 832 movq [dstq+strideq*2], m0
michael@0 833 psrldq m0, 2
michael@0 834 movq [dstq+stride3q ], m0
michael@0 835 lea dstq, [dstq+strideq*4]
michael@0 836 pshufhw m0, m0, q0000 ; de, d2ef, ef, e2fg, fg, f2gh, gh, g3h, 8xh
michael@0 837 psrldq m0, 2
michael@0 838 movq [dstq ], m0
michael@0 839 psrldq m0, 2
michael@0 840 movq [dstq+strideq ], m0
michael@0 841 psrldq m0, 2
michael@0 842 movq [dstq+strideq*2], m0
michael@0 843 psrldq m0, 2
michael@0 844 movq [dstq+stride3q ], m0
michael@0 845 RESTORE_GOT
michael@0 846 RET
michael@0 847
; d207_predictor_16x16
; Fills a 16x16 byte block at dst using only the 16 bytes loaded from left.
; cglobal operands after the name follow the x86inc convention: 4 arguments,
; 5 general-purpose registers, 5 vector registers, then the register names.
; The interleaved average / 3-tap vector is 32 bytes long: m1 holds the low
; half and m4 the high half. Rows 0-7 are 16-byte windows over m4:m1 taken
; with palignr at offsets 0, 2, .. 14. Rows 8-15 start from m4 and are
; advanced two bytes per row with a pshufb table that repeats the last byte.
; NOTE(review): X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 is defined outside this section;
; by its name it computes (x + 2*y + z + 2) >> 2 per byte, and the code below
; consumes its 4th operand as the result -- confirm against the macro.
; NOTE(review): mova is used for the load from left and for every row store,
; which presumably requires 16-byte alignment of left and of each dst row --
; confirm against the callers.
michael@0 848 INIT_XMM ssse3
michael@0 849 cglobal d207_predictor_16x16, 4, 5, 5, dst, stride, stride3, left, goffset
michael@0 850 GET_GOT goffsetq
michael@0 851 lea stride3q, [strideq*3]
michael@0 852 mova m0, [leftq] ; abcdefghijklmnop [byte]
michael@0 853 pshufb m1, m0, [GLOBAL(sh_b123456789abcdeff)] ; bcdefghijklmnopp
michael@0 854 pshufb m2, m0, [GLOBAL(sh_b23456789abcdefff)] ; cdefghijklmnoppp
michael@0 855
michael@0 856 X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3
michael@0 857 pavgb m1, m0 ; ab, bc, cd .. no, op, pp [byte]
michael@0 858
michael@0 859 punpckhbw m4, m1, m3 ; interleaved output, high 16 bytes
michael@0 860 punpcklbw m1, m3 ; interleaved output, low 16 bytes
michael@0 861 mova [dstq ], m1 ; row 0
michael@0 862 palignr m3, m4, m1, 2
michael@0 863 mova [dstq+strideq ], m3 ; row 1
michael@0 864 palignr m3, m4, m1, 4
michael@0 865 mova [dstq+strideq*2], m3 ; row 2
michael@0 866 palignr m3, m4, m1, 6
michael@0 867 mova [dstq+stride3q ], m3 ; row 3
michael@0 868 lea dstq, [dstq+strideq*4] ; dst now points at row 4
michael@0 869 palignr m3, m4, m1, 8
michael@0 870 mova [dstq ], m3 ; row 4
michael@0 871 palignr m3, m4, m1, 10
michael@0 872 mova [dstq+strideq ], m3 ; row 5
michael@0 873 palignr m3, m4, m1, 12
michael@0 874 mova [dstq+strideq*2], m3 ; row 6
michael@0 875 palignr m3, m4, m1, 14
michael@0 876 mova [dstq+stride3q ], m3 ; row 7
; left is no longer read after this point; the argument list is redeclared
; so that "line" can serve as the loop counter.
michael@0 877 DEFINE_ARGS dst, stride, stride3, line
michael@0 878 mov lined, 2 ; two passes of four rows each: rows 8-15
michael@0 879 mova m0, [GLOBAL(sh_b23456789abcdefff)] ; shift-by-2 shuffle, last byte repeated
michael@0 880 .loop:
michael@0 881 lea dstq, [dstq+strideq*4] ; advance dst by four rows
michael@0 882 mova [dstq ], m4
michael@0 883 pshufb m4, m0
michael@0 884 mova [dstq+strideq ], m4
michael@0 885 pshufb m4, m0
michael@0 886 mova [dstq+strideq*2], m4
michael@0 887 pshufb m4, m0
michael@0 888 mova [dstq+stride3q ], m4
michael@0 889 pshufb m4, m0 ; prepares the first row of the next pass
michael@0 890 dec lined
michael@0 891 jnz .loop
michael@0 892 RESTORE_GOT
michael@0 893 REP_RET
michael@0 894
; d207_predictor_32x32
; Fills a 32x32 byte block at dst using only the 32 bytes loaded from left.
; cglobal operands after the name follow the x86inc convention: 4 arguments,
; 5 general-purpose registers, 8 vector registers, then the register names.
; The interleaved average / 3-tap vector is 64 bytes long and is held in four
; registers, lowest bytes first: m1, m6, m2, m7. Each row is a 32-byte window
; over that vector starting 2 bytes further along than the row above; past
; the end of the vector the last byte is repeated.
; Because eight rows advance the window by 16 bytes, the right 16 bytes of
; row r equal the left 16 bytes of row r+8. The code exploits this by storing
; each such value twice: at [dstq+..+16] and at [dst8q+..], with dst8 kept
; eight rows below dst.
; NOTE(review): X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 is defined outside this section;
; by its name it computes (x + 2*y + z + 2) >> 2 per byte, and the code below
; consumes its 4th operand as the result. m1 and m2 are read again after the
; first invocation, so the macro is assumed not to modify its first three
; operands -- confirm against the macro.
; NOTE(review): mova is used for the loads from left and for every row store,
; which presumably requires 16-byte alignment of left and of each dst row --
; confirm against the callers.
michael@0 895 INIT_XMM ssse3
michael@0 896 cglobal d207_predictor_32x32, 4, 5, 8, dst, stride, stride3, left, goffset
michael@0 897 GET_GOT goffsetq
michael@0 898 lea stride3q, [strideq*3]
michael@0 899 mova m1, [leftq] ; 0-15 [byte]
michael@0 900 mova m2, [leftq+16] ; 16-31 [byte]
michael@0 901 pshufb m0, m2, [GLOBAL(sh_b23456789abcdefff)] ; 18-31, then byte 31 repeated
michael@0 902 pshufb m4, m2, [GLOBAL(sh_b123456789abcdeff)] ; 17-31, then byte 31 repeated
michael@0 903
michael@0 904 X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m2, m4, m0, m3
michael@0 905 palignr m6, m2, m1, 1 ; 1-16 [byte]
michael@0 906 palignr m5, m2, m1, 2 ; 2-17 [byte]
michael@0 907 pavgb m2, m4 ; high 16px even lines
michael@0 908
michael@0 909 X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m6, m5, m0
michael@0 910 pavgb m1, m6 ; low 16px even lines
michael@0 911
michael@0 912 punpckhbw m6, m1, m0 ; interleaved output 2
michael@0 913 punpcklbw m1, m0 ; interleaved output 1
michael@0 914
michael@0 915 punpckhbw m7, m2, m3 ; interleaved output 4
michael@0 916 punpcklbw m2, m3 ; interleaved output 3
michael@0 917
michael@0 918 ; output rows 0-7 in full, plus the left half of rows 8-15 via dst8
; left is no longer read after this point; its register is redeclared as dst8.
michael@0 919 DEFINE_ARGS dst, stride, stride3, dst8
michael@0 920 lea dst8q, [dstq+strideq*8] ; dst8 = row 8
michael@0 921 mova [dstq ], m1
michael@0 922 mova [dstq +16], m6
michael@0 923 mova [dst8q ], m6
michael@0 924 palignr m0, m6, m1, 2
michael@0 925 palignr m4, m2, m6, 2
michael@0 926 mova [dstq +strideq ], m0
michael@0 927 mova [dstq +strideq +16], m4
michael@0 928 mova [dst8q+strideq ], m4
michael@0 929 palignr m0, m6, m1, 4
michael@0 930 palignr m4, m2, m6, 4
michael@0 931 mova [dstq +strideq*2 ], m0
michael@0 932 mova [dstq +strideq*2+16], m4
michael@0 933 mova [dst8q+strideq*2 ], m4
michael@0 934 palignr m0, m6, m1, 6
michael@0 935 palignr m4, m2, m6, 6
michael@0 936 mova [dstq +stride3q ], m0
michael@0 937 mova [dstq +stride3q +16], m4
michael@0 938 mova [dst8q+stride3q ], m4
michael@0 939 lea dstq, [dstq +strideq*4] ; dst = row 4
michael@0 940 lea dst8q, [dst8q+strideq*4] ; dst8 = row 12
michael@0 941 palignr m0, m6, m1, 8
michael@0 942 palignr m4, m2, m6, 8
michael@0 943 mova [dstq ], m0
michael@0 944 mova [dstq +16], m4
michael@0 945 mova [dst8q ], m4
michael@0 946 palignr m0, m6, m1, 10
michael@0 947 palignr m4, m2, m6, 10
michael@0 948 mova [dstq +strideq ], m0
michael@0 949 mova [dstq +strideq +16], m4
michael@0 950 mova [dst8q+strideq ], m4
michael@0 951 palignr m0, m6, m1, 12
michael@0 952 palignr m4, m2, m6, 12
michael@0 953 mova [dstq +strideq*2 ], m0
michael@0 954 mova [dstq +strideq*2+16], m4
michael@0 955 mova [dst8q+strideq*2 ], m4
michael@0 956 palignr m0, m6, m1, 14
michael@0 957 palignr m4, m2, m6, 14
michael@0 958 mova [dstq +stride3q ], m0
michael@0 959 mova [dstq +stride3q +16], m4
michael@0 960 mova [dst8q+stride3q ], m4
michael@0 961 lea dstq, [dstq+strideq*4] ; dst = row 8
michael@0 962 lea dst8q, [dst8q+strideq*4] ; dst8 = row 16
michael@0 963
michael@0 964 ; output the right half of rows 8-15 and the left half of rows 16-23
michael@0 965 mova [dstq +16], m2
michael@0 966 mova [dst8q ], m2
michael@0 967 palignr m4, m7, m2, 2
michael@0 968 mova [dstq +strideq +16], m4
michael@0 969 mova [dst8q+strideq ], m4
michael@0 970 palignr m4, m7, m2, 4
michael@0 971 mova [dstq +strideq*2+16], m4
michael@0 972 mova [dst8q+strideq*2 ], m4
michael@0 973 palignr m4, m7, m2, 6
michael@0 974 mova [dstq +stride3q +16], m4
michael@0 975 mova [dst8q+stride3q ], m4
michael@0 976 lea dstq, [dstq+strideq*4] ; dst = row 12
michael@0 977 lea dst8q, [dst8q+strideq*4] ; dst8 = row 20
michael@0 978 palignr m4, m7, m2, 8
michael@0 979 mova [dstq +16], m4
michael@0 980 mova [dst8q ], m4
michael@0 981 palignr m4, m7, m2, 10
michael@0 982 mova [dstq +strideq +16], m4
michael@0 983 mova [dst8q+strideq ], m4
michael@0 984 palignr m4, m7, m2, 12
michael@0 985 mova [dstq +strideq*2+16], m4
michael@0 986 mova [dst8q+strideq*2 ], m4
michael@0 987 palignr m4, m7, m2, 14
michael@0 988 mova [dstq +stride3q +16], m4
michael@0 989 mova [dst8q+stride3q ], m4
michael@0 990 lea dstq, [dstq+strideq*4] ; dst = row 16
michael@0 991 lea dst8q, [dst8q+strideq*4] ; dst8 = row 24
michael@0 992
michael@0 993 ; output the right half of rows 16-23 and the left half of rows 24-31
; There is no register above m7 to feed palignr, so each 2-byte step is done
; with a pshufb table that shifts by two bytes and repeats the last byte.
michael@0 994 mova m0, [GLOBAL(sh_b23456789abcdefff)]
michael@0 995 mova [dstq +16], m7
michael@0 996 mova [dst8q ], m7
michael@0 997 pshufb m7, m0
michael@0 998 mova [dstq +strideq +16], m7
michael@0 999 mova [dst8q+strideq ], m7
michael@0 1000 pshufb m7, m0
michael@0 1001 mova [dstq +strideq*2+16], m7
michael@0 1002 mova [dst8q+strideq*2 ], m7
michael@0 1003 pshufb m7, m0
michael@0 1004 mova [dstq +stride3q +16], m7
michael@0 1005 mova [dst8q+stride3q ], m7
michael@0 1006 pshufb m7, m0
michael@0 1007 lea dstq, [dstq+strideq*4] ; dst = row 20
michael@0 1008 lea dst8q, [dst8q+strideq*4] ; dst8 = row 28
michael@0 1009 mova [dstq +16], m7
michael@0 1010 mova [dst8q ], m7
michael@0 1011 pshufb m7, m0
michael@0 1012 mova [dstq +strideq +16], m7
michael@0 1013 mova [dst8q+strideq ], m7
michael@0 1014 pshufb m7, m0
michael@0 1015 mova [dstq +strideq*2+16], m7
michael@0 1016 mova [dst8q+strideq*2 ], m7
michael@0 1017 pshufb m7, m0
michael@0 1018 mova [dstq +stride3q +16], m7
michael@0 1019 mova [dst8q+stride3q ], m7
michael@0 1020 pshufb m7, m0 ; eighth 2-byte step: every lane now holds the last byte
michael@0 1021 lea dstq, [dstq+strideq*4] ; dst = row 24
michael@0 1022
michael@0 1023 ; output the right half of rows 24-31: m7 is constant, so it is stored unchanged
michael@0 1024 mova [dstq +16], m7
michael@0 1025 mova [dstq +strideq +16], m7
michael@0 1026 mova [dstq +strideq*2+16], m7
michael@0 1027 mova [dstq +stride3q +16], m7
michael@0 1028 lea dstq, [dstq+strideq*4] ; dst = row 28
michael@0 1029 mova [dstq +16], m7
michael@0 1030 mova [dstq +strideq +16], m7
michael@0 1031 mova [dstq +strideq*2+16], m7
michael@0 1032 mova [dstq +stride3q +16], m7
michael@0 1033
michael@0 1034 ; done!
michael@0 1035 RESTORE_GOT
michael@0 1036 RET

mercurial