media/libvpx/vp9/common/x86/vp9_intrapred_ssse3.asm

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1,
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f, for hacking purposes.

     1 ;
     2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
     3 ;
     4 ;  Use of this source code is governed by a BSD-style license
     5 ;  that can be found in the LICENSE file in the root of the source
     6 ;  tree. An additional intellectual property rights grant can be found
     7 ;  in the file PATENTS.  All contributing project authors may
     8 ;  be found in the AUTHORS file in the root of the source tree.
     9 ;
    11 %include "third_party/x86inc/x86inc.asm"
    13 SECTION_RODATA
       ; Read-only constants shared by the predictors below.
       ;
       ; pb_1 is sixteen bytes of 1; it is used with pand to isolate the low
       ; bit of each byte in the rounding correction of the 3-tap filter.
       ;
       ; The sh_b* tables are pshufb control masks.  The hex digits in each
       ; name list, in order, the source byte index selected for each
       ; destination byte (e.g. sh_b12345677 -> 1,2,3,4,5,6,7,7).  Tables whose
       ; name has fewer than 16 digits are padded with zero indices up to
       ; 16 bytes.
    15 pb_1: times 16 db 1
    16 sh_b01234577: db 0, 1, 2, 3, 4, 5, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0
    17 sh_b12345677: db 1, 2, 3, 4, 5, 6, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0
    18 sh_b23456777: db 2, 3, 4, 5, 6, 7, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0
    19 sh_b0123456777777777: db 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7
    20 sh_b1234567777777777: db 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7
    21 sh_b2345677777777777: db 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7
    22 sh_b123456789abcdeff: db 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15
    23 sh_b23456789abcdefff: db 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15
    24 sh_b32104567: db 3, 2, 1, 0, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0
    25 sh_b8091a2b345: db 8, 0, 9, 1, 10, 2, 11, 3, 4, 5, 0, 0, 0, 0, 0, 0
    26 sh_b76543210: db 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0
    27 sh_b65432108: db 6, 5, 4, 3, 2, 1, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0
    28 sh_b54321089: db 5, 4, 3, 2, 1, 0, 8, 9, 0, 0, 0, 0, 0, 0, 0, 0
    29 sh_b89abcdef: db 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0
    30 sh_bfedcba9876543210: db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
    31 sh_b1233: db 1, 2, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
    32 sh_b2333: db 2, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
    34 SECTION .text
    36 INIT_MMX ssse3
       ; h_predictor_4x4: horizontal prediction of a 4x4 block.
       ; Every pixel of output row r is set to the byte left[r].
       ; Named arguments: dst, stride, line, left.  "line" is used as the loop
       ; counter register; left is fetched explicitly by movifnidn below.
    37 cglobal h_predictor_4x4, 2, 4, 3, dst, stride, line, left
    38   movifnidn          leftq, leftmp
         ; Point leftq one past the last left byte and count lineq from -2 up
         ; to 0, so [leftq+lineq*2] walks left[0], left[2] and the loop ends on
         ; the zero flag without a separate compare.
    39   add                leftq, 4
    40   mov                lineq, -2
         ; m0 = 0: an all-zero pshufb mask replicates source byte 0 everywhere.
    41   pxor                  m0, m0
    42 .loop:
         ; Two rows per iteration.  movd loads 4 bytes but only the first one
         ; survives the broadcast.
    43   movd                  m1, [leftq+lineq*2  ]
    44   movd                  m2, [leftq+lineq*2+1]
    45   pshufb                m1, m0
    46   pshufb                m2, m0
    47   movd      [dstq        ], m1
    48   movd      [dstq+strideq], m2
    49   lea                 dstq, [dstq+strideq*2]
    50   inc                lineq
    51   jnz .loop
    52   REP_RET
    54 INIT_MMX ssse3
       ; h_predictor_8x8: horizontal prediction of an 8x8 block.
       ; Every pixel of output row r is set to the byte left[r].
       ; Named arguments: dst, stride, line, left.  "line" is used as the loop
       ; counter register; left is fetched explicitly by movifnidn below.
    55 cglobal h_predictor_8x8, 2, 4, 3, dst, stride, line, left
    56   movifnidn          leftq, leftmp
         ; leftq = left + 8 and lineq runs -4..-1, so [leftq+lineq*2] visits
         ; left[0], left[2], ... and the loop exits when lineq reaches 0.
    57   add                leftq, 8
    58   mov                lineq, -4
         ; m0 = 0: an all-zero pshufb mask replicates source byte 0 everywhere.
    59   pxor                  m0, m0
    60 .loop:
         ; Two rows per iteration; each row is one broadcast left byte stored
         ; as 8 bytes with movq.
    61   movd                  m1, [leftq+lineq*2  ]
    62   movd                  m2, [leftq+lineq*2+1]
    63   pshufb                m1, m0
    64   pshufb                m2, m0
    65   movq      [dstq        ], m1
    66   movq      [dstq+strideq], m2
    67   lea                 dstq, [dstq+strideq*2]
    68   inc                lineq
    69   jnz .loop
    70   REP_RET
    72 INIT_XMM ssse3
       ; h_predictor_16x16: horizontal prediction of a 16x16 block.
       ; Every pixel of output row r is set to the byte left[r].
       ; Named arguments: dst, stride, line, left.  "line" is used as the loop
       ; counter register; left is fetched explicitly by movifnidn below.
       ; NOTE(review): rows are written with mova (the aligned move), so each
       ; dst row is presumably 16-byte aligned -- confirm against callers.
    73 cglobal h_predictor_16x16, 2, 4, 3, dst, stride, line, left
    74   movifnidn          leftq, leftmp
         ; leftq = left + 16 and lineq runs -8..-1, so [leftq+lineq*2] visits
         ; left[0], left[2], ... and the loop exits when lineq reaches 0.
    75   add                leftq, 16
    76   mov                lineq, -8
         ; m0 = 0: an all-zero pshufb mask replicates source byte 0 everywhere.
    77   pxor                  m0, m0
    78 .loop:
         ; Two rows per iteration; each row is one broadcast left byte stored
         ; as a single 16-byte vector.
    79   movd                  m1, [leftq+lineq*2  ]
    80   movd                  m2, [leftq+lineq*2+1]
    81   pshufb                m1, m0
    82   pshufb                m2, m0
    83   mova      [dstq        ], m1
    84   mova      [dstq+strideq], m2
    85   lea                 dstq, [dstq+strideq*2]
    86   inc                lineq
    87   jnz .loop
    88   REP_RET
    90 INIT_XMM ssse3
       ; h_predictor_32x32: horizontal prediction of a 32x32 block.
       ; Every pixel of output row r is set to the byte left[r].
       ; Named arguments: dst, stride, line, left.  "line" is used as the loop
       ; counter register; left is fetched explicitly by movifnidn below.
       ; NOTE(review): rows are written with mova (the aligned move), so each
       ; dst row is presumably 16-byte aligned -- confirm against callers.
    91 cglobal h_predictor_32x32, 2, 4, 3, dst, stride, line, left
    92   movifnidn          leftq, leftmp
         ; leftq = left + 32 and lineq runs -16..-1, so [leftq+lineq*2] visits
         ; left[0], left[2], ... and the loop exits when lineq reaches 0.
    93   add                leftq, 32
    94   mov                lineq, -16
         ; m0 = 0: an all-zero pshufb mask replicates source byte 0 everywhere.
    95   pxor                  m0, m0
    96 .loop:
         ; Two rows per iteration; each 32-byte row is the same broadcast
         ; vector stored twice (offsets 0 and 16).
    97   movd                  m1, [leftq+lineq*2  ]
    98   movd                  m2, [leftq+lineq*2+1]
    99   pshufb                m1, m0
   100   pshufb                m2, m0
   101   mova   [dstq           ], m1
   102   mova   [dstq        +16], m1
   103   mova   [dstq+strideq   ], m2
   104   mova   [dstq+strideq+16], m2
   105   lea                 dstq, [dstq+strideq*2]
   106   inc                lineq
   107   jnz .loop
   108   REP_RET
   110 INIT_MMX ssse3
       ; d45_predictor_4x4: 45-degree diagonal prediction of a 4x4 block.
       ; Reads 8 bytes from above[], applies the 3-tap filter
       ; (a[i] + 2*a[i+1] + a[i+2] + 2) >> 2 and writes row r as the filtered
       ; vector starting at byte r.
       ; goffset is the scratch register handed to GET_GOT for GLOBAL().
   111 cglobal d45_predictor_4x4, 3, 4, 4, dst, stride, above, goffset
   112   GET_GOT     goffsetq
   114   movq                m0, [aboveq]
         ; m2 = a[i+2], m1 = a[i], m0 = a[i+1], with the tail clamped to a[7].
         ; The 01234577 mask makes byte 6 of all three inputs equal a[7], so
         ; the last pixel stored (row 3, column 3) is exactly a[7].
   115   pshufb              m2, m0, [GLOBAL(sh_b23456777)]
   116   pshufb              m1, m0, [GLOBAL(sh_b01234577)]
   117   pshufb              m0, [GLOBAL(sh_b12345677)]
         ; m3 = (a[i] + a[i+2]) >> 1 without rounding: pavgb rounds up, so the
         ; low bit of (x ^ z) is subtracted back out.
   118   pavgb               m3, m2, m1
   119   pxor                m2, m1
   120   pand                m2, [GLOBAL(pb_1)]
   121   psubb               m3, m2
         ; m0 = rounded average of that with the centre tap a[i+1].
   122   pavgb               m0, m3
   124   ; store 4 lines
         ; Each row is the previous one shifted down by one byte (8 bits).
   125   movd    [dstq        ], m0
   126   psrlq               m0, 8
   127   movd    [dstq+strideq], m0
   128   lea               dstq, [dstq+strideq*2]
   129   psrlq               m0, 8
   130   movd    [dstq        ], m0
   131   psrlq               m0, 8
   132   movd    [dstq+strideq], m0
   134   RESTORE_GOT
   135   RET
   137 INIT_MMX ssse3
       ; d45_predictor_8x8: 45-degree diagonal prediction of an 8x8 block.
       ; Filters 8 bytes of above[] with (a[i] + 2*a[i+1] + a[i+2] + 2) >> 2
       ; (indices past 7 clamp to 7) and writes row r as that vector advanced
       ; by r bytes, replicating the last byte.
       ; goffset is the scratch register handed to GET_GOT for GLOBAL().
   138 cglobal d45_predictor_8x8, 3, 4, 4, dst, stride, above, goffset
   139   GET_GOT     goffsetq
   141   movq                m0, [aboveq]
         ; m1 keeps the "advance one byte, repeat byte 7" shuffle mask in a
         ; register because it is reused for every row below.
   142   mova                m1, [GLOBAL(sh_b12345677)]
         ; above is no longer needed after the load; its register becomes
         ; stride3.
   143   DEFINE_ARGS dst, stride, stride3
   144   lea           stride3q, [strideq*3]
         ; m3 = (a[i] + a[i+2]) >> 1 without rounding (pavgb minus low bit of
         ; the xor), then m0 = rounded average of a[i+1] and m3.
   145   pshufb              m2, m0, [GLOBAL(sh_b23456777)]
   146   pavgb               m3, m2, m0
   147   pxor                m2, m0
   148   pshufb              m0, m1
   149   pand                m2, [GLOBAL(pb_1)]
   150   psubb               m3, m2
   151   pavgb               m0, m3
   153   ; store 4 lines
         ; After each store, pshufb with m1 produces the next row in place.
   154   movq  [dstq          ], m0
   155   pshufb              m0, m1
   156   movq  [dstq+strideq  ], m0
   157   pshufb              m0, m1
   158   movq  [dstq+strideq*2], m0
   159   pshufb              m0, m1
   160   movq  [dstq+stride3q ], m0
   161   pshufb              m0, m1
   162   lea               dstq, [dstq+strideq*4]
   164   ; store next 4 lines
   165   movq  [dstq          ], m0
   166   pshufb              m0, m1
   167   movq  [dstq+strideq  ], m0
   168   pshufb              m0, m1
   169   movq  [dstq+strideq*2], m0
   170   pshufb              m0, m1
   171   movq  [dstq+stride3q ], m0
   173   RESTORE_GOT
   174   RET
   176 INIT_XMM ssse3
       ; d45_predictor_16x16: 45-degree diagonal prediction of a 16x16 block.
       ; Filters 16 bytes of above[] with (a[i] + 2*a[i+1] + a[i+2] + 2) >> 2
       ; (indices past 15 clamp to 15).  Row r is that vector advanced by r
       ; bytes with the last byte replicated.
       ; dst8 and line are scratch registers (second row pointer and loop
       ; counter); goffset is the scratch register handed to GET_GOT.
       ; NOTE(review): above and dst are accessed with mova (aligned move), so
       ; both are presumably 16-byte aligned -- confirm against callers.
   177 cglobal d45_predictor_16x16, 3, 6, 4, dst, stride, above, dst8, line, goffset
   178   GET_GOT     goffsetq
   180   mova                   m0, [aboveq]
         ; above is consumed; its register becomes stride3.
   181   DEFINE_ARGS dst, stride, stride3, dst8, line
   182   lea              stride3q, [strideq*3]
         ; dst8 tracks the row 8 lines below dst.
   183   lea                 dst8q, [dstq+strideq*8]
         ; m1 = "advance one byte, repeat byte 15" mask, reused for every row.
   184   mova                   m1, [GLOBAL(sh_b123456789abcdeff)]
         ; m3 = (a[i] + a[i+2]) >> 1 without rounding, m0 = avg(a[i+1], m3).
   185   pshufb                 m2, m0, [GLOBAL(sh_b23456789abcdefff)]
   186   pavgb                  m3, m2, m0
   187   pxor                   m2, m0
   188   pshufb                 m0, m1
   189   pand                   m2, [GLOBAL(pb_1)]
   190   psubb                  m3, m2
   191   pavgb                  m0, m3
   193   ; rows 0-7 (full width) and the left half of rows 8-15
         ; Row r+8 is row r advanced by 8 bytes, so its left 8 bytes equal the
         ; high 8 bytes of row r; movhps writes them without extra shuffles.
   194   mov                 lined, 2
   195 .loop:
   196   mova   [dstq            ], m0
   197   movhps [dst8q           ], m0
   198   pshufb                 m0, m1
   199   mova   [dstq +strideq   ], m0
   200   movhps [dst8q+strideq   ], m0
   201   pshufb                 m0, m1
   202   mova   [dstq +strideq*2 ], m0
   203   movhps [dst8q+strideq*2 ], m0
   204   pshufb                 m0, m1
   205   mova   [dstq +stride3q  ], m0
   206   movhps [dst8q+stride3q  ], m0
   207   pshufb                 m0, m1
   208   lea                  dstq, [dstq +strideq*4]
   209   lea                 dst8q, [dst8q+strideq*4]
   210   dec                 lined
   211   jnz .loop
   213   ; bottom-right 8x8 block
         ; m0 has been advanced 8 times, so its high 8 bytes all hold the last
         ; filtered value; dstq now points at row 8.
   214   movhps [dstq          +8], m0
   215   movhps [dstq+strideq  +8], m0
   216   movhps [dstq+strideq*2+8], m0
   217   movhps [dstq+stride3q +8], m0
   218   lea                  dstq, [dstq+strideq*4]
   219   movhps [dstq          +8], m0
   220   movhps [dstq+strideq  +8], m0
   221   movhps [dstq+strideq*2+8], m0
   222   movhps [dstq+stride3q +8], m0
   224   RESTORE_GOT
   225   RET
   227 INIT_XMM ssse3
       ; d45_predictor_32x32: 45-degree diagonal prediction of a 32x32 block.
       ; Filters 32 bytes of above[] with (a[i] + 2*a[i+1] + a[i+2] + 2) >> 2
       ; (indices past 31 clamp to 31).  Row r is that 32-byte vector advanced
       ; by r bytes with the last byte replicated.
       ; dst16 and line are scratch registers (second row pointer and loop
       ; counter); goffset is the scratch register handed to GET_GOT.
       ; NOTE(review): above and dst are accessed with mova (aligned move), so
       ; both are presumably 16-byte aligned -- confirm against callers.
   228 cglobal d45_predictor_32x32, 3, 6, 7, dst, stride, above, dst16, line, goffset
   229   GET_GOT     goffsetq
         ; m0 = above[0..15], m4 = above[16..31]
   231   mova                   m0, [aboveq]
   232   mova                   m4, [aboveq+16]
         ; above is consumed; its register becomes stride3.
   233   DEFINE_ARGS dst, stride, stride3, dst16, line
   234   lea              stride3q, [strideq*3]
         ; dst16 = dst + 16*stride, built in two steps of 8 rows.
   235   lea                dst16q, [dstq  +strideq*8]
   236   lea                dst16q, [dst16q+strideq*8]
         ; m1 = "advance one byte, repeat byte 15" mask, reused for every row.
   237   mova                   m1, [GLOBAL(sh_b123456789abcdeff)]
         ; High half: m3 = unrounded (a[i] + a[i+2]) >> 1, m4 = final filter.
         ; m5/m6 (above[1..16], above[2..17]) must be taken from m4 before the
         ; pshufb on the line after them overwrites it.
   238   pshufb                 m2, m4, [GLOBAL(sh_b23456789abcdefff)]
   239   pavgb                  m3, m2, m4
   240   pxor                   m2, m4
   241   palignr                m5, m4, m0, 1
   242   palignr                m6, m4, m0, 2
   243   pshufb                 m4, m1
   244   pand                   m2, [GLOBAL(pb_1)]
   245   psubb                  m3, m2
   246   pavgb                  m4, m3
         ; Low half: same filter with x = m0, y = m5, z = m6; result in m5.
   247   pavgb                  m3, m0, m6
   248   pxor                   m0, m6
   249   pand                   m0, [GLOBAL(pb_1)]
   250   psubb                  m3, m0
   251   pavgb                  m5, m3
   253   ; rows 0-15 (full width) and the left half of rows 16-31
         ; Row r+16 is row r advanced by 16 bytes, so its left half equals the
         ; right half (m4) of row r.  The low half alternates between m5 and
         ; m3 so each palignr can read the previous row's low half.
   254   mov                  lined, 4
   255 .loop:
   256   mova [dstq               ], m5
   257   mova [dstq            +16], m4
   258   mova [dst16q             ], m4
   259   palignr                 m3, m4, m5, 1
   260   pshufb                  m4, m1
   261   mova [dstq  +strideq     ], m3
   262   mova [dstq  +strideq  +16], m4
   263   mova [dst16q+strideq     ], m4
   264   palignr                 m5, m4, m3, 1
   265   pshufb                  m4, m1
   266   mova [dstq  +strideq*2   ], m5
   267   mova [dstq  +strideq*2+16], m4
   268   mova [dst16q+strideq*2   ], m4
   269   palignr                 m3, m4, m5, 1
   270   pshufb                  m4, m1
   271   mova [dstq  +stride3q    ], m3
   272   mova [dstq  +stride3q +16], m4
   273   mova [dst16q+stride3q    ], m4
   274   palignr                 m5, m4, m3, 1
   275   pshufb                  m4, m1
   276   lea                  dstq, [dstq  +strideq*4]
   277   lea                dst16q, [dst16q+strideq*4]
   278   dec                 lined
   279   jnz .loop
   281   ; right half of rows 16-31
         ; m4 has been advanced 16 times, so every byte holds the last
         ; filtered value; dstq now points at row 16.
   282   mova [dstq            +16], m4
   283   mova [dstq  +strideq  +16], m4
   284   mova [dstq  +strideq*2+16], m4
   285   mova [dstq  +stride3q +16], m4
   286   lea                  dstq, [dstq  +strideq*4]
   287   mova [dstq            +16], m4
   288   mova [dstq  +strideq  +16], m4
   289   mova [dstq  +strideq*2+16], m4
   290   mova [dstq  +stride3q +16], m4
   291   lea                  dstq, [dstq  +strideq*4]
   292   mova [dstq            +16], m4
   293   mova [dstq  +strideq  +16], m4
   294   mova [dstq  +strideq*2+16], m4
   295   mova [dstq  +stride3q +16], m4
   296   lea                  dstq, [dstq  +strideq*4]
   297   mova [dstq            +16], m4
   298   mova [dstq  +strideq  +16], m4
   299   mova [dstq  +strideq*2+16], m4
   300   mova [dstq  +stride3q +16], m4
   302   RESTORE_GOT
   303   RET
   305 ; ------------------------------------------
   306 ; X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 x, y, z, result
   307 ;
       ; Per-byte 3-tap filter: result = (x + 2*y + z + 2) >> 2.
       ;
   308 ; trick from pascal
   309 ; (x+2y+z+2)>>2 can be calculated as:
   310 ; result = avg(x,z)                 ; pavgb rounds up: (x+z+1)>>1
   311 ; result -= xor(x,z) & 1            ; undo the round-up: (x+z)>>1
   312 ; result = avg(result,y)            ; ((x+z)>>1 + y + 1)>>1
       ;
       ; Register effects: result (%4) is written; z (%3) is CLOBBERED (it is
       ; left holding (x ^ z) & 1); x (%1) and y (%2) are only read.
       ; The macro reads pb_1 through GLOBAL(), so it must be used between
       ; GET_GOT and RESTORE_GOT.
   313 ; ------------------------------------------
   314 %macro X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 4
   315   pavgb               %4, %1, %3
   316   pxor                %3, %1
   317   pand                %3, [GLOBAL(pb_1)]
   318   psubb               %4, %3
   319   pavgb               %4, %2
   320 %endmacro
   322 INIT_XMM ssse3
       ; d63_predictor_4x4: 63-degree diagonal prediction of a 4x4 block.
       ; Even rows use the 2-tap average of above[i], above[i+1]; odd rows use
       ; the 3-tap filter of above[i..i+2].  Every second row the vectors are
       ; advanced by one byte.
       ; goffset is the scratch register handed to GET_GOT for GLOBAL().
   323 cglobal d63_predictor_4x4, 3, 4, 5, dst, stride, above, goffset
   324   GET_GOT     goffsetq
   326   movq                m3, [aboveq]
         ; m1 = a[i+2], m2 = a[i+1] (indices clamped to 7), m3 = a[i]
   327   pshufb              m1, m3, [GLOBAL(sh_b23456777)]
   328   pshufb              m2, m3, [GLOBAL(sh_b12345677)]
         ; m4 = 3-tap result (the macro clobbers m1), then m3 = 2-tap average.
   330   X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m3, m2, m1, m4
   331   pavgb               m3, m2
   333   ; store 4 lines
   334   movd    [dstq        ], m3
   335   movd    [dstq+strideq], m4
   336   lea               dstq, [dstq+strideq*2]
         ; rows 2 and 3 are rows 0 and 1 advanced by one byte
   337   psrldq              m3, 1
   338   psrldq              m4, 1
   339   movd    [dstq        ], m3
   340   movd    [dstq+strideq], m4
   341   RESTORE_GOT
   342   RET
   344 INIT_XMM ssse3
       ; d63_predictor_8x8: 63-degree diagonal prediction of an 8x8 block.
       ; Even rows use the 2-tap average of above[i], above[i+1]; odd rows use
       ; the 3-tap filter of above[i..i+2].  Every second row the vectors are
       ; advanced by one byte.
       ; goffset is the scratch register handed to GET_GOT for GLOBAL().
   345 cglobal d63_predictor_8x8, 3, 4, 5, dst, stride, above, goffset
   346   GET_GOT     goffsetq
   348   movq                m3, [aboveq]
         ; above is consumed; its register becomes stride3.
   349   DEFINE_ARGS dst, stride, stride3
   350   lea           stride3q, [strideq*3]
         ; Extend the 8 loaded bytes to 16 by repeating a[7], so the byte
         ; shifts below pull in a[7] rather than zeros:
         ; m1 = a[i+2], m0 = m3 = a[i], m2 = a[i+1] (indices clamped to 7).
   351   pshufb              m1, m3, [GLOBAL(sh_b2345677777777777)]
   352   pshufb              m0, m3, [GLOBAL(sh_b0123456777777777)]
   353   pshufb              m2, m3, [GLOBAL(sh_b1234567777777777)]
   354   pshufb              m3, [GLOBAL(sh_b0123456777777777)]
         ; m4 = 3-tap result (the macro clobbers m1), m3 = 2-tap average.
   356   X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m2, m1, m4
   357   pavgb               m3, m2
   359   ; store 4 lines
         ; rows alternate m3 (2-tap) / m4 (3-tap); both advance one byte after
         ; each pair of rows.
   360   movq    [dstq        ], m3
   361   movq    [dstq+strideq], m4
   362   psrldq              m3, 1
   363   psrldq              m4, 1
   364   movq  [dstq+strideq*2], m3
   365   movq  [dstq+stride3q ], m4
   366   lea               dstq, [dstq+strideq*4]
   367   psrldq              m3, 1
   368   psrldq              m4, 1
   370   ; store 4 lines
   371   movq    [dstq        ], m3
   372   movq    [dstq+strideq], m4
   373   psrldq              m3, 1
   374   psrldq              m4, 1
   375   movq  [dstq+strideq*2], m3
   376   movq  [dstq+stride3q ], m4
   377   RESTORE_GOT
   378   RET
   380 INIT_XMM ssse3
       ; d63_predictor_16x16: 63-degree diagonal prediction of a 16x16 block.
       ; Even rows use the 2-tap average of above[i], above[i+1]; odd rows use
       ; the 3-tap filter of above[i..i+2].  Every second row the vectors are
       ; advanced by one byte, repeating the last byte.
       ; line is the loop counter; goffset is the GET_GOT scratch register.
       ; NOTE(review): above and dst are accessed with mova (aligned move), so
       ; both are presumably 16-byte aligned -- confirm against callers.
   381 cglobal d63_predictor_16x16, 3, 5, 5, dst, stride, above, line, goffset
   382   GET_GOT     goffsetq
   384   mova                m0, [aboveq]
         ; above is consumed; its register becomes stride3.
   385   DEFINE_ARGS dst, stride, stride3, line
   386   lea           stride3q, [strideq*3]
         ; m1 = "advance one byte, repeat byte 15" mask, reused in the loop.
   387   mova                m1, [GLOBAL(sh_b123456789abcdeff)]
         ; m2 = a[i+2], m3 = a[i+1] (indices clamped to 15), m0 = a[i]
   388   pshufb              m2, m0, [GLOBAL(sh_b23456789abcdefff)]
   389   pshufb              m3, m0, m1
         ; m4 = 3-tap result (the macro clobbers m2), m0 = 2-tap average.
   391   X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m3, m2, m4
   392   pavgb               m0, m3
         ; 4 iterations x 4 rows
   394   mov              lined, 4
   395 .loop:
   396   mova  [dstq          ], m0
   397   mova  [dstq+strideq  ], m4
   398   pshufb              m0, m1
   399   pshufb              m4, m1
   400   mova  [dstq+strideq*2], m0
   401   mova  [dstq+stride3q ], m4
   402   pshufb              m0, m1
   403   pshufb              m4, m1
   404   lea               dstq, [dstq+strideq*4]
   405   dec              lined
   406   jnz .loop
   407   RESTORE_GOT
   408   REP_RET
   410 INIT_XMM ssse3
       ; d63_predictor_32x32: 63-degree diagonal prediction of a 32x32 block.
       ; Even rows use the 2-tap average of above[i], above[i+1]; odd rows use
       ; the 3-tap filter of above[i..i+2].  Every second row the 32-byte
       ; vectors are advanced by one byte, repeating the last byte.
       ; line is the loop counter; goffset is the GET_GOT scratch register.
       ; NOTE(review): above and dst are accessed with mova (aligned move), so
       ; both are presumably 16-byte aligned -- confirm against callers.
   411 cglobal d63_predictor_32x32, 3, 5, 8, dst, stride, above, line, goffset
   412   GET_GOT     goffsetq
         ; m0 = above[0..15], m7 = above[16..31]
   414   mova                   m0, [aboveq]
   415   mova                   m7, [aboveq+16]
         ; above is consumed; its register becomes stride3.
   416   DEFINE_ARGS dst, stride, stride3, line
         ; m1 = "advance one byte, repeat byte 15" mask, reused in the loop.
   417   mova                   m1, [GLOBAL(sh_b123456789abcdeff)]
   418   lea              stride3q, [strideq*3]
         ; High half: m2 = a[i+2], m3 = a[i+1] (indices clamped to 31).
   419   pshufb                 m2, m7, [GLOBAL(sh_b23456789abcdefff)]
   420   pshufb                 m3, m7, m1
         ; m4 = high-half 3-tap result (the macro clobbers m2).
   422   X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m2, m4
         ; m6 = above[1..16], m5 = above[2..17]; both must read the unmodified
         ; m7 before the pavgb below turns it into the high-half 2-tap result.
   423   palignr                m6, m7, m0, 1
   424   palignr                m5, m7, m0, 2
   425   pavgb                  m7, m3
         ; m2 = low-half 3-tap result (the macro clobbers m5), m0 = low-half
         ; 2-tap average.
   427   X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m6, m5, m2
   428   pavgb                  m0, m6
         ; 8 iterations x 4 rows.  Even rows: m0|m7 (then m3|m7); odd rows:
         ; m2|m4 (then m5|m4).  The low halves alternate registers so each
         ; palignr can read the previous low half together with the high half.
   430   mov                 lined, 8
   431 .loop:
   432   mova  [dstq             ], m0
   433   mova  [dstq          +16], m7
   434   mova  [dstq+strideq     ], m2
   435   mova  [dstq+strideq  +16], m4
   436   palignr                m3, m7, m0, 1
   437   palignr                m5, m4, m2, 1
   438   pshufb                 m7, m1
   439   pshufb                 m4, m1
   441   mova  [dstq+strideq*2   ], m3
   442   mova  [dstq+strideq*2+16], m7
   443   mova  [dstq+stride3q    ], m5
   444   mova  [dstq+stride3q +16], m4
   445   palignr                m0, m7, m3, 1
   446   palignr                m2, m4, m5, 1
   447   pshufb                 m7, m1
   448   pshufb                 m4, m1
   449   lea                  dstq, [dstq+strideq*4]
   450   dec                 lined
   451   jnz .loop
   452   RESTORE_GOT
   453   REP_RET
   455 INIT_XMM ssse3
       ; d153_predictor_4x4: 153-degree diagonal prediction of a 4x4 block.
       ; Reads left[0..3] and above[-1..2] (top-left plus three top pixels),
       ; builds 2-tap (A) and 3-tap (B) averages along that edge, and writes
       ; the rows bottom-up, each one two bytes further along the interleaved
       ; A/B vector.
       ; goffset is the scratch register handed to GET_GOT for GLOBAL().
   456 cglobal d153_predictor_4x4, 4, 5, 4, dst, stride, above, left, goffset
   457   GET_GOT     goffsetq
   458   movd                m0, [leftq]               ; l1, l2, l3, l4
   459   movd                m1, [aboveq-1]            ; tl, t1, t2, t3
   460   punpckldq           m0, m1                    ; l1, l2, l3, l4, tl, t1, t2, t3
   461   pshufb              m0, [GLOBAL(sh_b32104567)]; l4, l3, l2, l1, tl, t1, t2, t3
   462   psrldq              m1, m0, 1                 ; l3, l2, l1, tl, t1, t2, t3
   463   psrldq              m2, m0, 2                 ; l2, l1, tl, t1, t2, t3
   464   ; comments below are for a predictor like this
   465   ; A1 B1 C1 D1
   466   ; A2 B2 A1 B1
   467   ; A3 B3 A2 B2
   468   ; A4 B4 A3 B3
         ; The macro clobbers m2; m0 and m1 are preserved for the pavgb below.
   469   X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3  ; 3-tap avg B4 B3 B2 B1 C1 D1
   470   pavgb               m1, m0                    ; 2-tap avg A4 A3 A2 A1
   472   punpcklqdq          m3, m1                    ; B4 B3 B2 B1 C1 D1 x x A4 A3 A2 A1 ..
         ; above and left are consumed; above's register becomes stride3.
   474   DEFINE_ARGS dst, stride, stride3
   475   lea           stride3q, [strideq*3]
   476   pshufb              m3, [GLOBAL(sh_b8091a2b345)] ; A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 ..
         ; Rows are stored from the bottom (row 3) up to row 0.
   477   movd  [dstq+stride3q ], m3
   478   psrldq              m3, 2                     ; A3 B3 A2 B2 A1 B1 C1 D1 ..
   479   movd  [dstq+strideq*2], m3
   480   psrldq              m3, 2                     ; A2 B2 A1 B1 C1 D1 ..
   481   movd  [dstq+strideq  ], m3
   482   psrldq              m3, 2                     ; A1 B1 C1 D1 ..
   483   movd  [dstq          ], m3
   484   RESTORE_GOT
   485   RET
   487 INIT_XMM ssse3
       ; d153_predictor_8x8: 153-degree diagonal prediction of an 8x8 block.
       ; Reads left[0..7] and above[-1..6], builds 2-tap (A) and 3-tap (B)
       ; averages down the left edge and 3-tap averages (C..H) along the top,
       ; then writes each row as an 8-byte window over the interleaved A/B
       ; bytes followed by the top-row averages.
       ; goffset is the scratch register handed to GET_GOT for GLOBAL().
   488 cglobal d153_predictor_8x8, 4, 5, 8, dst, stride, above, left, goffset
   489   GET_GOT     goffsetq
   490   movq                m0, [leftq]                     ; [0- 7] l1-8 [byte]
   491   movhps              m0, [aboveq-1]                  ; [8-15] tl, t1-7 [byte]
   492   pshufb              m1, m0, [GLOBAL(sh_b76543210)]  ; l8-1 [word]
   493   pshufb              m2, m0, [GLOBAL(sh_b65432108)]  ; l7-1,tl [word]
   494   pshufb              m3, m0, [GLOBAL(sh_b54321089)]  ; l6-1,tl,t1 [word]
   495   pshufb              m0, [GLOBAL(sh_b89abcdef)]      ; tl,t1-7 [word]
   496   psrldq              m4, m0, 1                       ; t1-7 [word]
   497   psrldq              m5, m0, 2                       ; t2-7 [word]
   498   ; comments below are for a predictor like this
   499   ; A1 B1 C1 D1 E1 F1 G1 H1
   500   ; A2 B2 A1 B1 C1 D1 E1 F1
   501   ; A3 B3 A2 B2 A1 B1 C1 D1
   502   ; A4 B4 A3 B3 A2 B2 A1 B1
   503   ; A5 B5 A4 B4 A3 B3 A2 B2
   504   ; A6 B6 A5 B5 A4 B4 A3 B3
   505   ; A7 B7 A6 B6 A5 B5 A4 B4
   506   ; A8 B8 A7 B7 A6 B6 A5 B5
   507   pavgb               m6, m1, m2                ; 2-tap avg A8-A1
         ; Each macro call clobbers its third operand (m5, then m3).  The
         ; second call reuses m0 as its output, which is safe because the
         ; first call has already consumed the top-row data held there.
   509   X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m4, m5, m7  ; 3-tap avg C-H1
   511   X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m2, m3, m0  ; 3-tap avg B8-1
   513   punpcklbw           m6, m0                    ; A-B8, A-B7 ... A-B2, A-B1
         ; above and left are consumed; above's register becomes stride3.
   515   DEFINE_ARGS dst, stride, stride3
   516   lea           stride3q, [strideq*3]
         ; Rows 0-3 are written bottom-up (row 3 first), then dst advances four
         ; rows and rows 4-7 are written bottom-up as well.
   518   movhps [dstq+stride3q], m6                    ; A-B4, A-B3, A-B2, A-B1
   519   palignr             m0, m7, m6, 10            ; A-B3, A-B2, A-B1, C-H1
   520   movq  [dstq+strideq*2], m0
   521   psrldq              m0, 2                     ; A-B2, A-B1, C-H1
   522   movq  [dstq+strideq  ], m0
   523   psrldq              m0, 2                     ; A-H1
   524   movq  [dstq          ], m0
   525   lea               dstq, [dstq+strideq*4]
   526   movq  [dstq+stride3q ], m6                    ; A-B8, A-B7, A-B6, A-B5
   527   psrldq              m6, 2                     ; A-B7, A-B6, A-B5, A-B4
   528   movq  [dstq+strideq*2], m6
   529   psrldq              m6, 2                     ; A-B6, A-B5, A-B4, A-B3
   530   movq  [dstq+strideq  ], m6
   531   psrldq              m6, 2                     ; A-B5, A-B4, A-B3, A-B2
   532   movq  [dstq          ], m6
   533   RESTORE_GOT
   534   RET
   536 INIT_XMM ssse3
       ; d153_predictor_16x16: 153-degree diagonal prediction of a 16x16 block.
       ; Reads left[0..15] and above[-1..14], builds 2-tap (A) and 3-tap (B)
       ; averages down the left edge and 3-tap averages (C..P) along the top,
       ; then writes each row as a 16-byte window sliding two bytes per row
       ; over the concatenation of those vectors (see the layout below).
       ; goffset is the scratch register handed to GET_GOT for GLOBAL().
       ; NOTE(review): left and dst are accessed with mova (aligned move), so
       ; both are presumably 16-byte aligned -- confirm against callers.
   537 cglobal d153_predictor_16x16, 4, 5, 8, dst, stride, above, left, goffset
   538   GET_GOT     goffsetq
   539   mova                m0, [leftq]
   540   movu                m7, [aboveq-1]
   541   ; comments below are for a predictor like this
   542   ; A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1 M1 N1 O1 P1
   543   ; A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1 M1 N1
   544   ; A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1
   545   ; A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1
   546   ; A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1
   547   ; A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1
   548   ; A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1
   549   ; A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1
   550   ; A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2
   551   ; Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3
   552   ; Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4
   553   ; Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5
   554   ; Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6
   555   ; Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7
   556   ; Af Bf Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8
   557   ; Ag Bg Af Bf Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9
         ; m6 = above[-1..14] byte-reversed, so palignr can prepend the
         ; top-left neighbours to the left column: m5 = left shifted by one
         ; position, m3 = left shifted by two positions.
   558   pshufb              m6, m7, [GLOBAL(sh_bfedcba9876543210)]
   559   palignr             m5, m0, m6, 15
   560   palignr             m3, m0, m6, 14
         ; The macro clobbers m3; m0 and m5 stay intact for the pavgb below.
   562   X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m5, m3, m4          ; 3-tap avg B3-Bg
         ; (A dead "pshufb m1, m0, [sh_b123456789abcdeff]" used to sit here;
         ; m1 was never read before the macro further down overwrote it.)
   564   pavgb               m5, m0                            ; A1 - Ag
   566   punpcklbw           m0, m4, m5                        ; A-B8 ... A-B1
   567   punpckhbw           m4, m5                            ; A-B9 ... A-Bg
         ; Top row: m3 / m5 are m7 advanced by one / two bytes (last repeated).
   569   pshufb              m3, m7, [GLOBAL(sh_b123456789abcdeff)]
   570   pshufb              m5, m7, [GLOBAL(sh_b23456789abcdefff)]
         ; The macro clobbers m5 and writes its result to m1.
   572   X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m5, m1          ; 3-tap avg C1-P1
         ; m6 = m0 byte-reversed, ready to be windowed together with m1.
   574   pshufb              m6, m0, [GLOBAL(sh_bfedcba9876543210)]
         ; above and left are consumed; above's register becomes stride3.
   575   DEFINE_ARGS dst, stride, stride3
   576   lea           stride3q, [strideq*3]
         ; Rows 0-7: 16-byte windows over m1:m6, sliding two bytes per row.
   577   palignr             m2, m1, m6, 14
   578   mova  [dstq          ], m2
   579   palignr             m2, m1, m6, 12
   580   mova  [dstq+strideq  ], m2
   581   palignr             m2, m1, m6, 10
   582   mova  [dstq+strideq*2], m2
   583   palignr             m2, m1, m6, 8
   584   mova  [dstq+stride3q ], m2
   585   lea               dstq, [dstq+strideq*4]
   586   palignr             m2, m1, m6, 6
   587   mova  [dstq          ], m2
   588   palignr             m2, m1, m6, 4
   589   mova  [dstq+strideq  ], m2
   590   palignr             m2, m1, m6, 2
   591   mova  [dstq+strideq*2], m2
         ; Reverse the high interleave now; it is needed from row 8 onwards.
   592   pshufb              m4, [GLOBAL(sh_bfedcba9876543210)]
   593   mova  [dstq+stride3q ], m6
   594   lea               dstq, [dstq+strideq*4]
         ; Rows 8-15: the same sliding window over m6:m4.
   596   palignr             m2, m6, m4, 14
   597   mova  [dstq          ], m2
   598   palignr             m2, m6, m4, 12
   599   mova  [dstq+strideq  ], m2
   600   palignr             m2, m6, m4, 10
   601   mova  [dstq+strideq*2], m2
   602   palignr             m2, m6, m4, 8
   603   mova  [dstq+stride3q ], m2
   604   lea               dstq, [dstq+strideq*4]
   605   palignr             m2, m6, m4, 6
   606   mova  [dstq          ], m2
   607   palignr             m2, m6, m4, 4
   608   mova  [dstq+strideq  ], m2
   609   palignr             m2, m6, m4, 2
   610   mova  [dstq+strideq*2], m2
   611   mova  [dstq+stride3q ], m4
   612   RESTORE_GOT
   613   RET
   615 INIT_XMM ssse3
       ; d153_predictor_32x32: 153-degree diagonal prediction of a 32x32 block.
       ; Reads left[0..31] and above[-1..30].  It builds 3-tap averages along
       ; the top edge and interleaved 3-tap / 2-tap averages down the left
       ; edge, then writes every row as a 32-byte window that slides two bytes
       ; per row over the concatenation of those vectors.  Rows are produced
       ; in four groups of eight.
       ; goffset is the scratch register handed to GET_GOT for GLOBAL().
       ; NOTE(review): left and dst are accessed with mova (aligned move), so
       ; both are presumably 16-byte aligned -- confirm against callers.
   616 cglobal d153_predictor_32x32, 4, 5, 8, dst, stride, above, left, goffset
   617   GET_GOT     goffsetq
         ; m0 = left[0..15], m7 = above[-1..14], m1 = above[15..30]
   618   mova                  m0, [leftq]
   619   movu                  m7, [aboveq-1]
   620   movu                  m1, [aboveq+15]
         ; m4 / m6 = m1 advanced by one / two bytes (last byte repeated)
   622   pshufb                m4, m1, [GLOBAL(sh_b123456789abcdeff)]
   623   pshufb                m6, m1, [GLOBAL(sh_b23456789abcdefff)]
         ; result in m2; the macro clobbers m6
   625   X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m4, m6, m2          ; 3-tap avg above [high]
         ; m3 / m5 = the m1:m7 pair advanced by one / two bytes
   627   palignr               m3, m1, m7, 1
   628   palignr               m5, m1, m7, 2
         ; result overwrites m1 (its old value is no longer needed);
         ; the macro clobbers m5
   630   X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m5, m1          ; 3-tap avg above [low]
         ; Reverse m7 so palignr can prepend the top-left neighbours to the
         ; left column: m5 / m3 = left[0..15] shifted by one / two positions.
   632   pshufb                m7, [GLOBAL(sh_bfedcba9876543210)]
   633   palignr               m5, m0, m7, 15
   634   palignr               m3, m0, m7, 14
   636   X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m5, m3, m4          ; 3-tap avg B3-Bg
   637   pavgb                 m5, m0                            ; A1 - Ag
   638   punpcklbw             m6, m4, m5                        ; A-B8 ... A-B1
   639   punpckhbw             m4, m5                            ; A-B9 ... A-Bg
         ; byte-reverse both interleaved halves so they can be windowed with
         ; palignr below
   640   pshufb                m6, [GLOBAL(sh_bfedcba9876543210)]
   641   pshufb                m4, [GLOBAL(sh_bfedcba9876543210)]
         ; above's register becomes stride3; left is kept because it is read
         ; again below.  "line" is declared here but not referenced anywhere
         ; in this function.
   643   DEFINE_ARGS dst, stride, stride3, left, line
   644   lea             stride3q, [strideq*3]
         ; ---- rows 0-7: left half windows m1:m6, right half windows m2:m1 ----
   646   palignr               m5, m2, m1, 14
   647   palignr               m7, m1, m6, 14
   648   mova  [dstq            ], m7
   649   mova  [dstq+16         ], m5
   650   palignr               m5, m2, m1, 12
   651   palignr               m7, m1, m6, 12
   652   mova  [dstq+strideq    ], m7
   653   mova  [dstq+strideq+16 ], m5
   654   palignr                m5, m2, m1, 10
   655   palignr                m7, m1, m6, 10
   656   mova  [dstq+strideq*2   ], m7
   657   mova  [dstq+strideq*2+16], m5
   658   palignr                m5, m2, m1, 8
   659   palignr                m7, m1, m6, 8
   660   mova  [dstq+stride3q    ], m7
   661   mova  [dstq+stride3q+16 ], m5
   662   lea                  dstq, [dstq+strideq*4]
   663   palignr                m5, m2, m1, 6
   664   palignr                m7, m1, m6, 6
   665   mova  [dstq             ], m7
   666   mova  [dstq+16          ], m5
   667   palignr                m5, m2, m1, 4
   668   palignr                m7, m1, m6, 4
   669   mova  [dstq+strideq     ], m7
   670   mova  [dstq+strideq+16  ], m5
   671   palignr                m5, m2, m1, 2
   672   palignr                m7, m1, m6, 2
   673   mova  [dstq+strideq*2   ], m7
   674   mova  [dstq+strideq*2+16], m5
   675   mova  [dstq+stride3q    ], m6
   676   mova  [dstq+stride3q+16 ], m1
   677   lea                  dstq, [dstq+strideq*4]
         ; ---- rows 8-15: left half windows m6:m4, right half windows m1:m6 ----
   679   palignr                m5, m1, m6, 14
   680   palignr                m3, m6, m4, 14
   681   mova  [dstq             ], m3
   682   mova  [dstq+16          ], m5
   683   palignr                m5, m1, m6, 12
   684   palignr                m3, m6, m4, 12
   685   mova  [dstq+strideq     ], m3
   686   mova  [dstq+strideq+16  ], m5
   687   palignr                m5, m1, m6, 10
   688   palignr                m3, m6, m4, 10
   689   mova  [dstq+strideq*2   ], m3
   690   mova  [dstq+strideq*2+16], m5
   691   palignr                m5, m1, m6, 8
   692   palignr                m3, m6, m4, 8
   693   mova  [dstq+stride3q    ], m3
   694   mova  [dstq+stride3q+16 ], m5
   695   lea                  dstq, [dstq+strideq*4]
   696   palignr                m5, m1, m6, 6
   697   palignr                m3, m6, m4, 6
   698   mova  [dstq             ], m3
   699   mova  [dstq+16          ], m5
   700   palignr                m5, m1, m6, 4
   701   palignr                m3, m6, m4, 4
   702   mova  [dstq+strideq     ], m3
   703   mova  [dstq+strideq+16  ], m5
   704   palignr                m5, m1, m6, 2
   705   palignr                m3, m6, m4, 2
   706   mova  [dstq+strideq*2   ], m3
   707   mova  [dstq+strideq*2+16], m5
   708   mova  [dstq+stride3q    ], m4
   709   mova  [dstq+stride3q+16 ], m6
   710   lea               dstq, [dstq+strideq*4]
         ; Second half of the left column: m7 = left[0..15] (reloaded),
         ; m3 = left[16..31]; m5 / m0 = m3 shifted by one / two positions with
         ; bytes pulled in from m7.
   712   mova                   m7, [leftq]
   713   mova                   m3, [leftq+16]
   714   palignr                m5, m3, m7, 15
   715   palignr                m0, m3, m7, 14
         ; result in m2; the macro clobbers m0
   717   X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m3, m5, m0, m2          ; 3-tap avg Bh -
   718   pavgb                  m5, m3                            ; Ah -
   719   punpcklbw              m3, m2, m5                        ; A-B8 ... A-B1
   720   punpckhbw              m2, m5                            ; A-B9 ... A-Bg
   721   pshufb                 m3, [GLOBAL(sh_bfedcba9876543210)]
   722   pshufb                 m2, [GLOBAL(sh_bfedcba9876543210)]
         ; ---- rows 16-23: left half windows m4:m3, right half windows m6:m4 ----
   724   palignr                m7, m6, m4, 14
   725   palignr                m0, m4, m3, 14
   726   mova  [dstq             ], m0
   727   mova  [dstq+16          ], m7
   728   palignr                m7, m6, m4, 12
   729   palignr                m0, m4, m3, 12
   730   mova  [dstq+strideq     ], m0
   731   mova  [dstq+strideq+16  ], m7
   732   palignr                m7, m6, m4, 10
   733   palignr                m0, m4, m3, 10
   734   mova  [dstq+strideq*2   ], m0
   735   mova  [dstq+strideq*2+16], m7
   736   palignr                m7, m6, m4, 8
   737   palignr                m0, m4, m3, 8
   738   mova  [dstq+stride3q    ], m0
   739   mova  [dstq+stride3q+16 ], m7
   740   lea                  dstq, [dstq+strideq*4]
   741   palignr                m7, m6, m4, 6
   742   palignr                m0, m4, m3, 6
   743   mova  [dstq             ], m0
   744   mova  [dstq+16          ], m7
   745   palignr                m7, m6, m4, 4
   746   palignr                m0, m4, m3, 4
   747   mova  [dstq+strideq     ], m0
   748   mova  [dstq+strideq+16  ], m7
   749   palignr                m7, m6, m4, 2
   750   palignr                m0, m4, m3, 2
   751   mova  [dstq+strideq*2   ], m0
   752   mova  [dstq+strideq*2+16], m7
   753   mova  [dstq+stride3q    ], m3
   754   mova  [dstq+stride3q+16 ], m4
   755   lea                  dstq, [dstq+strideq*4]
         ; ---- rows 24-31: left half windows m3:m2, right half windows m4:m3 ----
   757   palignr                m7, m4, m3, 14
   758   palignr                m0, m3, m2, 14
   759   mova  [dstq             ], m0
   760   mova  [dstq+16          ], m7
   761   palignr                m7, m4, m3, 12
   762   palignr                m0, m3, m2, 12
   763   mova  [dstq+strideq     ], m0
   764   mova  [dstq+strideq+16  ], m7
   765   palignr                m7, m4, m3, 10
   766   palignr                m0, m3, m2, 10
   767   mova  [dstq+strideq*2   ], m0
   768   mova  [dstq+strideq*2+16], m7
   769   palignr                m7, m4, m3, 8
   770   palignr                m0, m3, m2, 8
   771   mova  [dstq+stride3q    ], m0
   772   mova  [dstq+stride3q+16 ], m7
   773   lea                  dstq, [dstq+strideq*4]
   774   palignr                m7, m4, m3, 6
   775   palignr                m0, m3, m2, 6
   776   mova  [dstq             ], m0
   777   mova  [dstq+16          ], m7
   778   palignr                m7, m4, m3, 4
   779   palignr                m0, m3, m2, 4
   780   mova  [dstq+strideq     ], m0
   781   mova  [dstq+strideq+16  ], m7
   782   palignr                m7, m4, m3, 2
   783   palignr                m0, m3, m2, 2
   784   mova  [dstq+strideq*2   ], m0
   785   mova  [dstq+strideq*2+16], m7
   786   mova  [dstq+stride3q    ], m2
   787   mova  [dstq+stride3q+16 ], m3
   789   RESTORE_GOT
   790   RET
   792 INIT_MMX ssse3
       ; d207_predictor_4x4(dst, stride, unused, left)
       ;
       ; Fills a 4x4 block of bytes at dst (rows are `stride` bytes apart)
       ; from the four bytes a,b,c,d read at `left`. The third argument is
       ; named `unused` and is never referenced. `goffset` is a fifth, scratch
       ; register handed to GET_GOT so GLOBAL() can address the shuffle tables.
       ;
       ; Row layout, in the notation of the inline comments below
       ; (ab = pavgb of a and b; a2bc = 3-tap value built from a,b,c):
       ;   row 0: ab   a2bc  bc   b2cd
       ;   row 1: bc   b2cd  cd   c3d
       ;   row 2: cd   c3d   d    d
       ;   row 3: d    d     d    d
       ;
       ; NOTE(review): X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 is defined outside this
       ; chunk; judging by its name and by how m2 is consumed afterwards it
       ; presumably leaves (x + 2*y + z + 2) >> 2 in its fourth operand --
       ; confirm against the macro definition.
       ; NOTE(review): this routine works in MMX registers and no emms is
       ; visible in this block; confirm that the callers (or RET) reset the
       ; MMX/x87 state.
   793 cglobal d207_predictor_4x4, 4, 5, 4, dst, stride, unused, left, goffset
   794   GET_GOT     goffsetq
   795   movd                m0, [leftq]                ; abcd [byte]
   796   pshufb              m1, m0, [GLOBAL(sh_b1233)] ; bcdd [byte]
   797   pshufb              m3, m0, [GLOBAL(sh_b2333)] ; cddd
   799   X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m3, m2
   800   pavgb               m1, m0             ; ab, bc, cd, d [byte]
         ; interleave the 2-tap averages (m1) with the 3-tap values (m2)
   802   punpcklbw           m1, m2             ; ab, a2bc, bc, b2cd, cd, c3d, d, d
   803   movd    [dstq        ], m1             ; row 0 = low 4 bytes
         ; every following row is the same vector moved down by one byte pair
   804   psrlq               m1, 16             ; bc, b2cd, cd, c3d, d, d
   805   movd    [dstq+strideq], m1             ; row 1
   806   lea               dstq, [dstq+strideq*2]
   807   psrlq               m1, 16             ; cd, c3d, d, d
   808   movd    [dstq        ], m1             ; row 2
         ; word 1 holds the pair (d, d); broadcast it to every word
   809   pshufw              m1, m1, q1111      ; d, d, d, d
   810   movd    [dstq+strideq], m1             ; row 3
   811   RESTORE_GOT
   812   RET
   814 INIT_XMM ssse3
       ; d207_predictor_8x8(dst, stride, <third arg>, left)
       ;
       ; Fills an 8x8 block of bytes at dst (rows are `stride` bytes apart)
       ; from the eight bytes a..h read at `left`. The third argument register
       ; is named stride3 and is overwritten with stride*3 before it is ever
       ; read, so the incoming third argument is ignored.
       ;
       ; The routine builds one 16-byte vector that interleaves the 2-tap
       ; averages with the 3-tap values (ab, a2bc, bc, b2cd, ...), then
       ; stores its low 8 bytes as a row and moves the vector down by two
       ; bytes for each following row.
       ;
       ; NOTE(review): X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 is defined outside this
       ; chunk; it presumably leaves (x + 2*y + z + 2) >> 2 in its fourth
       ; operand (m3 here) -- confirm against the macro definition.
   815 cglobal d207_predictor_8x8, 4, 5, 4, dst, stride, stride3, left, goffset
   816   GET_GOT     goffsetq
   817   movq                m3, [leftq]            ; abcdefgh [byte]
   818   lea           stride3q, [strideq*3]
         ; three views of the left column: shifted by 2, by 0 and by 1 byte,
         ; each padded with copies of byte 7 (h) per the shuffle tables
   820   pshufb              m1, m3, [GLOBAL(sh_b2345677777777777)]
   821   pshufb              m0, m3, [GLOBAL(sh_b0123456777777777)]
   822   pshufb              m2, m3, [GLOBAL(sh_b1234567777777777)]
   824   X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m2, m1, m3
   825   pavgb               m0, m2        ; ab, bc, cd, ... gh, h
   826   punpcklbw           m0, m3        ; interleaved output
         ; rows 0-3: store the low 8 bytes, then drop one byte pair per row
   828   movq  [dstq          ], m0
   829   psrldq              m0, 2
   830   movq  [dstq+strideq  ], m0
   831   psrldq              m0, 2
   832   movq  [dstq+strideq*2], m0
   833   psrldq              m0, 2
   834   movq  [dstq+stride3q ], m0
   835   lea               dstq, [dstq+strideq*4]
         ; psrldq shifts in zeros; refill the upper half with the (h, h) pair
         ; so that rows 4-7 are padded with h instead of 0
   836   pshufhw             m0, m0, q0000 ; de, d2ef, ef, e2fg, fg, f2gh, gh, g3h, 8xh
   837   psrldq              m0, 2
   838   movq  [dstq          ], m0
   839   psrldq              m0, 2
   840   movq  [dstq+strideq  ], m0
   841   psrldq              m0, 2
   842   movq  [dstq+strideq*2], m0
   843   psrldq              m0, 2
   844   movq  [dstq+stride3q ], m0
   845   RESTORE_GOT
   846   RET
   848 INIT_XMM ssse3
       ; d207_predictor_16x16(dst, stride, <third arg>, left)
       ;
       ; Fills a 16x16 block of bytes at dst (rows are `stride` bytes apart)
       ; from the sixteen bytes a..p read at `left`. The third argument
       ; register is named stride3 and is overwritten with stride*3 before it
       ; is read, so the incoming third argument is ignored.
       ;
       ; The interleaved (2-tap, 3-tap) sequence is 32 bytes long and lives in
       ; m1 (low 16 bytes) and m4 (high 16 bytes). Row r is the 16-byte window
       ; starting at byte 2*r of that sequence:
       ;   rows 0-7  : palignr windows across m4:m1
       ;   rows 8-15 : m4 itself, shifted down two bytes per row by pshufb,
       ;               with byte 15 replicated into the vacated positions
       ;
       ; NOTE(review): X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 is defined outside this
       ; chunk; it presumably leaves (x + 2*y + z + 2) >> 2 in its fourth
       ; operand (m3 here) -- confirm against the macro definition.
       ; NOTE(review): all loads/stores use mova; presumably left and every
       ; dst row must be 16-byte aligned -- confirm against the callers.
   849 cglobal d207_predictor_16x16, 4, 5, 5, dst, stride, stride3, left, goffset
   850   GET_GOT     goffsetq
   851   lea           stride3q, [strideq*3]
   852   mova                m0, [leftq]            ; abcdefghijklmnop [byte]
   853   pshufb              m1, m0, [GLOBAL(sh_b123456789abcdeff)] ; bcdefghijklmnopp
   854   pshufb              m2, m0, [GLOBAL(sh_b23456789abcdefff)] ; cdefghijklmnoppp
   856   X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3
   857   pavgb               m1, m0                 ; ab, bc, cd .. no, op, pp [byte]
   859   punpckhbw           m4, m1, m3    ; interleaved output, high 16 bytes
   860   punpcklbw           m1, m3        ; interleaved output, low 16 bytes
         ; rows 0-7: slide a 16-byte window over m4:m1, two bytes per row
   861   mova  [dstq          ], m1
   862   palignr             m3, m4, m1, 2
   863   mova  [dstq+strideq  ], m3
   864   palignr             m3, m4, m1, 4
   865   mova  [dstq+strideq*2], m3
   866   palignr             m3, m4, m1, 6
   867   mova  [dstq+stride3q ], m3
   868   lea               dstq, [dstq+strideq*4]
   869   palignr             m3, m4, m1, 8
   870   mova  [dstq          ], m3
   871   palignr             m3, m4, m1, 10
   872   mova  [dstq+strideq  ], m3
   873   palignr             m3, m4, m1, 12
   874   mova  [dstq+strideq*2], m3
   875   palignr             m3, m4, m1, 14
   876   mova  [dstq+stride3q ], m3
         ; left has been fully consumed, so its argument slot (the fourth) is
         ; re-labelled as the loop counter `line`
   877   DEFINE_ARGS dst, stride, stride3, line
   878   mov              lined, 2           ; 2 iterations x 4 rows = rows 8-15
   879   mova                m0, [GLOBAL(sh_b23456789abcdefff)]
   880 .loop:
   881   lea               dstq, [dstq+strideq*4]
         ; each pshufb drops the two lowest bytes of m4 and repeats byte 15
   882   mova  [dstq          ], m4
   883   pshufb              m4, m0
   884   mova  [dstq+strideq  ], m4
   885   pshufb              m4, m0
   886   mova  [dstq+strideq*2], m4
   887   pshufb              m4, m0
   888   mova  [dstq+stride3q ], m4
   889   pshufb              m4, m0
   890   dec              lined
   891   jnz .loop
   892   RESTORE_GOT
   893   REP_RET
   895 INIT_XMM ssse3
       ; d207_predictor_32x32(dst, stride, <third arg>, left)
       ;
       ; Fills a 32x32 block of bytes at dst (rows are `stride` bytes apart)
       ; from the 32 bytes read at `left`. The third argument register is
       ; named stride3 and is overwritten with stride*3 before it is read, so
       ; the incoming third argument is ignored.
       ;
       ; The interleaved (2-tap, 3-tap) sequence is 64 bytes long and is held
       ; in four registers: m1 = bytes 0-15, m6 = 16-31, m2 = 32-47,
       ; m7 = 48-63 ("interleaved output 1..4" below). Row r is the 32-byte
       ; window starting at byte 2*r of that sequence, padded with the last
       ; byte once the sequence runs out.
       ;
       ; Because row r+8 starts 16 bytes later in the sequence than row r, the
       ; right half (dst+16) of row r equals the left half of row r+8. The
       ; code exploits this: dst8 always points 8 rows below dst, and each
       ; computed 16-byte vector is stored to both [dst...+16] and [dst8...].
       ;
       ; NOTE(review): X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 is defined outside this
       ; chunk; it presumably leaves (x + 2*y + z + 2) >> 2 in its fourth
       ; operand -- confirm against the macro definition.
       ; NOTE(review): all loads/stores use mova; presumably left and every
       ; dst row must be 16-byte aligned -- confirm against the callers.
   896 cglobal d207_predictor_32x32, 4, 5, 8, dst, stride, stride3, left, goffset
   897   GET_GOT     goffsetq
   898   lea           stride3q, [strideq*3]
   899   mova                m1, [leftq]              ;  0-15 [byte]
   900   mova                m2, [leftq+16]           ; 16-31 [byte]
         ; upper half: neighbours come from the same register, with byte 15
         ; (the last left pixel) replicated at the end
   901   pshufb              m0, m2, [GLOBAL(sh_b23456789abcdefff)]
   902   pshufb              m4, m2, [GLOBAL(sh_b123456789abcdeff)]
   904   X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m2, m4, m0, m3
         ; lower half: neighbours are windows across m2:m1 (left[1..16] and
         ; left[2..17])
   905   palignr             m6, m2, m1, 1
   906   palignr             m5, m2, m1, 2
   907   pavgb               m2, m4         ; high 16px even lines
   909   X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m6, m5, m0
   910   pavgb                   m1, m6         ; low 16px even lines
   912   punpckhbw               m6, m1, m0               ; interleaved output 2
   913   punpcklbw               m1, m0                   ; interleaved output 1
   915   punpckhbw               m7, m2, m3               ; interleaved output 4
   916   punpcklbw               m2, m3                   ; interleaved output 3
   918   ; output 1st 8 lines (and half of 2nd 8 lines)
         ; left has been fully consumed, so its argument slot (the fourth) is
         ; re-labelled as dst8, a second row pointer 8 rows below dst
   919   DEFINE_ARGS dst, stride, stride3, dst8
   920   lea                  dst8q, [dstq+strideq*8]
   921   mova  [dstq              ], m1
   922   mova  [dstq           +16], m6
   923   mova  [dst8q             ], m6
   924   palignr             m0, m6, m1, 2
   925   palignr             m4, m2, m6, 2
   926   mova  [dstq +strideq     ], m0
   927   mova  [dstq +strideq  +16], m4
   928   mova  [dst8q+strideq     ], m4
   929   palignr             m0, m6, m1, 4
   930   palignr             m4, m2, m6, 4
   931   mova  [dstq +strideq*2   ], m0
   932   mova  [dstq +strideq*2+16], m4
   933   mova  [dst8q+strideq*2   ], m4
   934   palignr             m0, m6, m1, 6
   935   palignr             m4, m2, m6, 6
   936   mova  [dstq +stride3q    ], m0
   937   mova  [dstq +stride3q +16], m4
   938   mova  [dst8q+stride3q    ], m4
   939   lea               dstq, [dstq +strideq*4]
   940   lea              dst8q, [dst8q+strideq*4]
   941   palignr             m0, m6, m1, 8
   942   palignr             m4, m2, m6, 8
   943   mova  [dstq              ], m0
   944   mova  [dstq           +16], m4
   945   mova  [dst8q             ], m4
   946   palignr             m0, m6, m1, 10
   947   palignr             m4, m2, m6, 10
   948   mova  [dstq +strideq     ], m0
   949   mova  [dstq +strideq  +16], m4
   950   mova  [dst8q+strideq     ], m4
   951   palignr             m0, m6, m1, 12
   952   palignr             m4, m2, m6, 12
   953   mova  [dstq +strideq*2   ], m0
   954   mova  [dstq +strideq*2+16], m4
   955   mova  [dst8q+strideq*2   ], m4
   956   palignr             m0, m6, m1, 14
   957   palignr             m4, m2, m6, 14
   958   mova  [dstq +stride3q    ], m0
   959   mova  [dstq +stride3q +16], m4
   960   mova  [dst8q+stride3q    ], m4
   961   lea               dstq, [dstq+strideq*4]
   962   lea              dst8q, [dst8q+strideq*4]
   964   ; output 2nd half of 2nd 8 lines and half of 3rd 8 lines
         ; here dst = row 8 and dst8 = row 16; the left halves of rows 8-15
         ; were already written through dst8 above
   965   mova  [dstq           +16], m2
   966   mova  [dst8q             ], m2
   967   palignr             m4, m7, m2, 2
   968   mova  [dstq +strideq  +16], m4
   969   mova  [dst8q+strideq     ], m4
   970   palignr             m4, m7, m2, 4
   971   mova  [dstq +strideq*2+16], m4
   972   mova  [dst8q+strideq*2   ], m4
   973   palignr             m4, m7, m2, 6
   974   mova  [dstq +stride3q +16], m4
   975   mova  [dst8q+stride3q    ], m4
   976   lea               dstq, [dstq+strideq*4]
   977   lea              dst8q, [dst8q+strideq*4]
   978   palignr             m4, m7, m2, 8
   979   mova  [dstq           +16], m4
   980   mova  [dst8q             ], m4
   981   palignr             m4, m7, m2, 10
   982   mova  [dstq +strideq  +16], m4
   983   mova  [dst8q+strideq     ], m4
   984   palignr             m4, m7, m2, 12
   985   mova  [dstq +strideq*2+16], m4
   986   mova  [dst8q+strideq*2   ], m4
   987   palignr             m4, m7, m2, 14
   988   mova  [dstq +stride3q +16], m4
   989   mova  [dst8q+stride3q    ], m4
   990   lea               dstq, [dstq+strideq*4]
   991   lea              dst8q, [dst8q+strideq*4]
   993   ; output 2nd half of 3rd 8 lines and half of 4th 8 lines
         ; here dst = row 16 and dst8 = row 24; the sequence is exhausted
         ; after m7, so each pshufb drops the two lowest bytes of m7 and
         ; repeats byte 15 in their place
   994   mova                m0, [GLOBAL(sh_b23456789abcdefff)]
   995   mova  [dstq           +16], m7
   996   mova  [dst8q             ], m7
   997   pshufb              m7, m0
   998   mova  [dstq +strideq  +16], m7
   999   mova  [dst8q+strideq     ], m7
  1000   pshufb              m7, m0
  1001   mova  [dstq +strideq*2+16], m7
  1002   mova  [dst8q+strideq*2   ], m7
  1003   pshufb              m7, m0
  1004   mova  [dstq +stride3q +16], m7
  1005   mova  [dst8q+stride3q    ], m7
  1006   pshufb              m7, m0
  1007   lea               dstq, [dstq+strideq*4]
  1008   lea              dst8q, [dst8q+strideq*4]
  1009   mova  [dstq           +16], m7
  1010   mova  [dst8q             ], m7
  1011   pshufb              m7, m0
  1012   mova  [dstq +strideq  +16], m7
  1013   mova  [dst8q+strideq     ], m7
  1014   pshufb              m7, m0
  1015   mova  [dstq +strideq*2+16], m7
  1016   mova  [dst8q+strideq*2   ], m7
  1017   pshufb              m7, m0
  1018   mova  [dstq +stride3q +16], m7
  1019   mova  [dst8q+stride3q    ], m7
  1020   pshufb              m7, m0
         ; dst8 has just written the left halves of rows 28-31 and is not
         ; advanced again; only dst moves on (to row 24)
  1021   lea               dstq, [dstq+strideq*4]
  1023   ; output last half of 4th 8 lines
         ; after eight 2-byte shuffles every byte of m7 is a copy of its
         ; original byte 15, so the right halves of rows 24-31 are constant
  1024   mova  [dstq           +16], m7
  1025   mova  [dstq +strideq  +16], m7
  1026   mova  [dstq +strideq*2+16], m7
  1027   mova  [dstq +stride3q +16], m7
  1028   lea               dstq, [dstq+strideq*4]
  1029   mova  [dstq           +16], m7
  1030   mova  [dstq +strideq  +16], m7
  1031   mova  [dstq +strideq*2+16], m7
  1032   mova  [dstq +stride3q +16], m7
  1034   ; done!
  1035   RESTORE_GOT
  1036   RET

mercurial