media/libvpx/vp9/common/x86/vp9_postproc_mmx.asm

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

     1 ;
     2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
     3 ;
     4 ;  Use of this source code is governed by a BSD-style license
     5 ;  that can be found in the LICENSE file in the root of the source
     6 ;  tree. An additional intellectual property rights grant can be found
     7 ;  in the file PATENTS.  All contributing project authors may
     8 ;  be found in the AUTHORS file in the root of the source tree.
     9 ;
    12 %include "vpx_ports/x86_abi_support.asm"
    14 %define VP9_FILTER_WEIGHT 128
    15 %define VP9_FILTER_SHIFT  7
    17 ;void vp9_post_proc_down_and_across_mmx
    18 ;(
    19 ;    unsigned char *src_ptr,
    20 ;    unsigned char *dst_ptr,
    21 ;    int src_pixels_per_line,
    22 ;    int dst_pixels_per_line,
    23 ;    int rows,
    24 ;    int cols,
    25 ;    int flimit
    26 ;)
    27 global sym(vp9_post_proc_down_and_across_mmx) PRIVATE
    28 sym(vp9_post_proc_down_and_across_mmx):
    29     push        rbp
    30     mov         rbp, rsp
    31     SHADOW_ARGS_TO_STACK 7
    32     GET_GOT     rbx
    33     push        rsi
    34     push        rdi
    35     ; end prolog
    37 %if ABI_IS_32BIT=1 && CONFIG_PIC=1
    38     ; move the global rd onto the stack, since we don't have enough registers
    39     ; to do PIC addressing
    40     movq        mm0, [GLOBAL(rd)]
    41     sub         rsp, 8
    42     movq        [rsp], mm0
    43 %define RD [rsp]
    44 %else
    45 %define RD [GLOBAL(rd)]
    46 %endif
    48         push        rbx
    49         lea         rbx, [GLOBAL(Blur)]
    50         movd        mm2, dword ptr arg(6) ;flimit
    51         punpcklwd   mm2, mm2
    52         punpckldq   mm2, mm2
    54         mov         rsi,        arg(0) ;src_ptr
    55         mov         rdi,        arg(1) ;dst_ptr
    57         movsxd      rcx, DWORD PTR arg(4) ;rows
    58         movsxd      rax, DWORD PTR arg(2) ;src_pixels_per_line ; destination pitch?
    59         pxor        mm0, mm0              ; mm0 = 00000000
    61 .nextrow:
    63         xor         rdx,        rdx       ; clear out rdx for use as loop counter
    64 .nextcol:
    66         pxor        mm7, mm7              ; mm7 = 00000000
    67         movq        mm6, [rbx + 32 ]      ; mm6 = kernel 2 taps
    68         movq        mm3, [rsi]            ; mm4 = r0 p0..p7
    69         punpcklbw   mm3, mm0              ; mm3 = p0..p3
    70         movq        mm1, mm3              ; mm1 = p0..p3
    71         pmullw      mm3, mm6              ; mm3 *= kernel 2 modifiers
    73         movq        mm6, [rbx + 48]       ; mm6 = kernel 3 taps
    74         movq        mm5, [rsi + rax]      ; mm4 = r1 p0..p7
    75         punpcklbw   mm5, mm0              ; mm5 = r1 p0..p3
    76         pmullw      mm6, mm5              ; mm6 *= p0..p3 * kernel 3 modifiers
    77         paddusw     mm3, mm6              ; mm3 += mm6
    79         ; thresholding
    80         movq        mm7, mm1              ; mm7 = r0 p0..p3
    81         psubusw     mm7, mm5              ; mm7 = r0 p0..p3 - r1 p0..p3
    82         psubusw     mm5, mm1              ; mm5 = r1 p0..p3 - r0 p0..p3
    83         paddusw     mm7, mm5              ; mm7 = abs(r0 p0..p3 - r1 p0..p3)
    84         pcmpgtw     mm7, mm2
    86         movq        mm6, [rbx + 64 ]      ; mm6 = kernel 4 modifiers
    87         movq        mm5, [rsi + 2*rax]    ; mm4 = r2 p0..p7
    88         punpcklbw   mm5, mm0              ; mm5 = r2 p0..p3
    89         pmullw      mm6, mm5              ; mm5 *= kernel 4 modifiers
    90         paddusw     mm3, mm6              ; mm3 += mm5
    92         ; thresholding
    93         movq        mm6, mm1              ; mm6 = r0 p0..p3
    94         psubusw     mm6, mm5              ; mm6 = r0 p0..p3 - r2 p0..p3
    95         psubusw     mm5, mm1              ; mm5 = r2 p0..p3 - r2 p0..p3
    96         paddusw     mm6, mm5              ; mm6 = abs(r0 p0..p3 - r2 p0..p3)
    97         pcmpgtw     mm6, mm2
    98         por         mm7, mm6              ; accumulate thresholds
   101         neg         rax
   102         movq        mm6, [rbx ]           ; kernel 0 taps
   103         movq        mm5, [rsi+2*rax]      ; mm4 = r-2 p0..p7
   104         punpcklbw   mm5, mm0              ; mm5 = r-2 p0..p3
   105         pmullw      mm6, mm5              ; mm5 *= kernel 0 modifiers
   106         paddusw     mm3, mm6              ; mm3 += mm5
   108         ; thresholding
   109         movq        mm6, mm1              ; mm6 = r0 p0..p3
   110         psubusw     mm6, mm5              ; mm6 = p0..p3 - r-2 p0..p3
   111         psubusw     mm5, mm1              ; mm5 = r-2 p0..p3 - p0..p3
   112         paddusw     mm6, mm5              ; mm6 = abs(r0 p0..p3 - r-2 p0..p3)
   113         pcmpgtw     mm6, mm2
   114         por         mm7, mm6              ; accumulate thresholds
   116         movq        mm6, [rbx + 16]       ; kernel 1 taps
   117         movq        mm4, [rsi+rax]        ; mm4 = r-1 p0..p7
   118         punpcklbw   mm4, mm0              ; mm4 = r-1 p0..p3
   119         pmullw      mm6, mm4              ; mm4 *= kernel 1 modifiers.
   120         paddusw     mm3, mm6              ; mm3 += mm5
   122         ; thresholding
   123         movq        mm6, mm1              ; mm6 = r0 p0..p3
   124         psubusw     mm6, mm4              ; mm6 = p0..p3 - r-2 p0..p3
   125         psubusw     mm4, mm1              ; mm5 = r-1 p0..p3 - p0..p3
   126         paddusw     mm6, mm4              ; mm6 = abs(r0 p0..p3 - r-1 p0..p3)
   127         pcmpgtw     mm6, mm2
   128         por         mm7, mm6              ; accumulate thresholds
   131         paddusw     mm3, RD               ; mm3 += round value
   132         psraw       mm3, VP9_FILTER_SHIFT     ; mm3 /= 128
   134         pand        mm1, mm7              ; mm1 select vals > thresh from source
   135         pandn       mm7, mm3              ; mm7 select vals < thresh from blurred result
   136         paddusw     mm1, mm7              ; combination
   138         packuswb    mm1, mm0              ; pack to bytes
   140         movd        [rdi], mm1            ;
   141         neg         rax                   ; pitch is positive
   144         add         rsi, 4
   145         add         rdi, 4
   146         add         rdx, 4
   148         cmp         edx, dword ptr arg(5) ;cols
   149         jl          .nextcol
   150         ; done with the all cols, start the across filtering in place
   151         sub         rsi, rdx
   152         sub         rdi, rdx
   155         push        rax
   156         xor         rdx,    rdx
   157         mov         rax,    [rdi-4];
   159 .acrossnextcol:
   160         pxor        mm7, mm7              ; mm7 = 00000000
   161         movq        mm6, [rbx + 32 ]      ;
   162         movq        mm4, [rdi+rdx]        ; mm4 = p0..p7
   163         movq        mm3, mm4              ; mm3 = p0..p7
   164         punpcklbw   mm3, mm0              ; mm3 = p0..p3
   165         movq        mm1, mm3              ; mm1 = p0..p3
   166         pmullw      mm3, mm6              ; mm3 *= kernel 2 modifiers
   168         movq        mm6, [rbx + 48]
   169         psrlq       mm4, 8                ; mm4 = p1..p7
   170         movq        mm5, mm4              ; mm5 = p1..p7
   171         punpcklbw   mm5, mm0              ; mm5 = p1..p4
   172         pmullw      mm6, mm5              ; mm6 *= p1..p4 * kernel 3 modifiers
   173         paddusw     mm3, mm6              ; mm3 += mm6
   175         ; thresholding
   176         movq        mm7, mm1              ; mm7 = p0..p3
   177         psubusw     mm7, mm5              ; mm7 = p0..p3 - p1..p4
   178         psubusw     mm5, mm1              ; mm5 = p1..p4 - p0..p3
   179         paddusw     mm7, mm5              ; mm7 = abs(p0..p3 - p1..p4)
   180         pcmpgtw     mm7, mm2
   182         movq        mm6, [rbx + 64 ]
   183         psrlq       mm4, 8                ; mm4 = p2..p7
   184         movq        mm5, mm4              ; mm5 = p2..p7
   185         punpcklbw   mm5, mm0              ; mm5 = p2..p5
   186         pmullw      mm6, mm5              ; mm5 *= kernel 4 modifiers
   187         paddusw     mm3, mm6              ; mm3 += mm5
   189         ; thresholding
   190         movq        mm6, mm1              ; mm6 = p0..p3
   191         psubusw     mm6, mm5              ; mm6 = p0..p3 - p1..p4
   192         psubusw     mm5, mm1              ; mm5 = p1..p4 - p0..p3
   193         paddusw     mm6, mm5              ; mm6 = abs(p0..p3 - p1..p4)
   194         pcmpgtw     mm6, mm2
   195         por         mm7, mm6              ; accumulate thresholds
   198         movq        mm6, [rbx ]
   199         movq        mm4, [rdi+rdx-2]      ; mm4 = p-2..p5
   200         movq        mm5, mm4              ; mm5 = p-2..p5
   201         punpcklbw   mm5, mm0              ; mm5 = p-2..p1
   202         pmullw      mm6, mm5              ; mm5 *= kernel 0 modifiers
   203         paddusw     mm3, mm6              ; mm3 += mm5
   205         ; thresholding
   206         movq        mm6, mm1              ; mm6 = p0..p3
   207         psubusw     mm6, mm5              ; mm6 = p0..p3 - p1..p4
   208         psubusw     mm5, mm1              ; mm5 = p1..p4 - p0..p3
   209         paddusw     mm6, mm5              ; mm6 = abs(p0..p3 - p1..p4)
   210         pcmpgtw     mm6, mm2
   211         por         mm7, mm6              ; accumulate thresholds
   213         movq        mm6, [rbx + 16]
   214         psrlq       mm4, 8                ; mm4 = p-1..p5
   215         punpcklbw   mm4, mm0              ; mm4 = p-1..p2
   216         pmullw      mm6, mm4              ; mm4 *= kernel 1 modifiers.
   217         paddusw     mm3, mm6              ; mm3 += mm5
   219         ; thresholding
   220         movq        mm6, mm1              ; mm6 = p0..p3
   221         psubusw     mm6, mm4              ; mm6 = p0..p3 - p1..p4
   222         psubusw     mm4, mm1              ; mm5 = p1..p4 - p0..p3
   223         paddusw     mm6, mm4              ; mm6 = abs(p0..p3 - p1..p4)
   224         pcmpgtw     mm6, mm2
   225         por         mm7, mm6              ; accumulate thresholds
   227         paddusw     mm3, RD               ; mm3 += round value
   228         psraw       mm3, VP9_FILTER_SHIFT     ; mm3 /= 128
   230         pand        mm1, mm7              ; mm1 select vals > thresh from source
   231         pandn       mm7, mm3              ; mm7 select vals < thresh from blurred result
   232         paddusw     mm1, mm7              ; combination
   234         packuswb    mm1, mm0              ; pack to bytes
   235         mov         DWORD PTR [rdi+rdx-4],  eax   ; store previous four bytes
   236         movd        eax,    mm1
   238         add         rdx, 4
   239         cmp         edx, dword ptr arg(5) ;cols
   240         jl          .acrossnextcol;
   242         mov         DWORD PTR [rdi+rdx-4],  eax
   243         pop         rax
   245         ; done with this rwo
   246         add         rsi,rax               ; next line
   247         movsxd      rax, dword ptr arg(3) ;dst_pixels_per_line ; destination pitch?
   248         add         rdi,rax               ; next destination
   249         movsxd      rax, dword ptr arg(2) ;src_pixels_per_line ; destination pitch?
   251         dec         rcx                   ; decrement count
   252         jnz         .nextrow               ; next row
   253         pop         rbx
   255     ; begin epilog
   256     pop rdi
   257     pop rsi
   258     RESTORE_GOT
   259     UNSHADOW_ARGS
   260     pop         rbp
   261     ret
   262 %undef RD
   265 ;void vp9_mbpost_proc_down_mmx(unsigned char *dst,
   266 ;                             int pitch, int rows, int cols,int flimit)
   267 extern sym(vp9_rv)
   268 global sym(vp9_mbpost_proc_down_mmx) PRIVATE
   269 sym(vp9_mbpost_proc_down_mmx):
   270     push        rbp
   271     mov         rbp, rsp
   272     SHADOW_ARGS_TO_STACK 5
   273     GET_GOT     rbx
   274     push        rsi
   275     push        rdi
   276     ; end prolog
   278     ALIGN_STACK 16, rax
   279     sub         rsp, 136
   281     ; unsigned char d[16][8] at [rsp]
   282     ; create flimit2 at [rsp+128]
   283     mov         eax, dword ptr arg(4) ;flimit
   284     mov         [rsp+128], eax
   285     mov         [rsp+128+4], eax
   286 %define flimit2 [rsp+128]
   288 %if ABI_IS_32BIT=0
   289     lea         r8,       [GLOBAL(sym(vp9_rv))]
   290 %endif
   292     ;rows +=8;
   293     add         dword ptr arg(2), 8
   295     ;for(c=0; c<cols; c+=4)
   296 .loop_col:
   297             mov         rsi,        arg(0)  ;s
   298             pxor        mm0,        mm0     ;
   300             movsxd      rax,        dword ptr arg(1) ;pitch       ;
   301             neg         rax                                     ; rax = -pitch
   303             lea         rsi,        [rsi + rax*8];              ; rdi = s[-pitch*8]
   304             neg         rax
   307             pxor        mm5,        mm5
   308             pxor        mm6,        mm6     ;
   310             pxor        mm7,        mm7     ;
   311             mov         rdi,        rsi
   313             mov         rcx,        15          ;
   315 .loop_initvar:
   316             movd        mm1,        DWORD PTR [rdi];
   317             punpcklbw   mm1,        mm0     ;
   319             paddw       mm5,        mm1     ;
   320             pmullw      mm1,        mm1     ;
   322             movq        mm2,        mm1     ;
   323             punpcklwd   mm1,        mm0     ;
   325             punpckhwd   mm2,        mm0     ;
   326             paddd       mm6,        mm1     ;
   328             paddd       mm7,        mm2     ;
   329             lea         rdi,        [rdi+rax]   ;
   331             dec         rcx
   332             jne         .loop_initvar
   333             ;save the var and sum
   334             xor         rdx,        rdx
   335 .loop_row:
   336             movd        mm1,        DWORD PTR [rsi]     ; [s-pitch*8]
   337             movd        mm2,        DWORD PTR [rdi]     ; [s+pitch*7]
   339             punpcklbw   mm1,        mm0
   340             punpcklbw   mm2,        mm0
   342             paddw       mm5,        mm2
   343             psubw       mm5,        mm1
   345             pmullw      mm2,        mm2
   346             movq        mm4,        mm2
   348             punpcklwd   mm2,        mm0
   349             punpckhwd   mm4,        mm0
   351             paddd       mm6,        mm2
   352             paddd       mm7,        mm4
   354             pmullw      mm1,        mm1
   355             movq        mm2,        mm1
   357             punpcklwd   mm1,        mm0
   358             psubd       mm6,        mm1
   360             punpckhwd   mm2,        mm0
   361             psubd       mm7,        mm2
   364             movq        mm3,        mm6
   365             pslld       mm3,        4
   367             psubd       mm3,        mm6
   368             movq        mm1,        mm5
   370             movq        mm4,        mm5
   371             pmullw      mm1,        mm1
   373             pmulhw      mm4,        mm4
   374             movq        mm2,        mm1
   376             punpcklwd   mm1,        mm4
   377             punpckhwd   mm2,        mm4
   379             movq        mm4,        mm7
   380             pslld       mm4,        4
   382             psubd       mm4,        mm7
   384             psubd       mm3,        mm1
   385             psubd       mm4,        mm2
   387             psubd       mm3,        flimit2
   388             psubd       mm4,        flimit2
   390             psrad       mm3,        31
   391             psrad       mm4,        31
   393             packssdw    mm3,        mm4
   394             packsswb    mm3,        mm0
   396             movd        mm1,        DWORD PTR [rsi+rax*8]
   398             movq        mm2,        mm1
   399             punpcklbw   mm1,        mm0
   401             paddw       mm1,        mm5
   402             mov         rcx,        rdx
   404             and         rcx,        127
   405 %if ABI_IS_32BIT=1 && CONFIG_PIC=1
   406             push        rax
   407             lea         rax,        [GLOBAL(sym(vp9_rv))]
   408             movq        mm4,        [rax + rcx*2] ;vp9_rv[rcx*2]
   409             pop         rax
   410 %elif ABI_IS_32BIT=0
   411             movq        mm4,        [r8 + rcx*2] ;vp9_rv[rcx*2]
   412 %else
   413             movq        mm4,        [sym(vp9_rv) + rcx*2]
   414 %endif
   415             paddw       mm1,        mm4
   416             ;paddw     xmm1,       eight8s
   417             psraw       mm1,        4
   419             packuswb    mm1,        mm0
   420             pand        mm1,        mm3
   422             pandn       mm3,        mm2
   423             por         mm1,        mm3
   425             and         rcx,        15
   426             movd        DWORD PTR   [rsp+rcx*4], mm1 ;d[rcx*4]
   428             mov         rcx,        rdx
   429             sub         rcx,        8
   431             and         rcx,        15
   432             movd        mm1,        DWORD PTR [rsp+rcx*4] ;d[rcx*4]
   434             movd        [rsi],      mm1
   435             lea         rsi,        [rsi+rax]
   437             lea         rdi,        [rdi+rax]
   438             add         rdx,        1
   440             cmp         edx,        dword arg(2) ;rows
   441             jl          .loop_row
   444         add         dword arg(0), 4 ; s += 4
   445         sub         dword arg(3), 4 ; cols -= 4
   446         cmp         dword arg(3), 0
   447         jg          .loop_col
   449     add         rsp, 136
   450     pop         rsp
   452     ; begin epilog
   453     pop rdi
   454     pop rsi
   455     RESTORE_GOT
   456     UNSHADOW_ARGS
   457     pop         rbp
   458     ret
   459 %undef flimit2
   462 ;void vp9_plane_add_noise_mmx (unsigned char *start, unsigned char *noise,
   463 ;                            unsigned char blackclamp[16],
   464 ;                            unsigned char whiteclamp[16],
   465 ;                            unsigned char bothclamp[16],
   466 ;                            unsigned int width, unsigned int height, int pitch)
   467 extern sym(rand)
   468 global sym(vp9_plane_add_noise_mmx) PRIVATE
   469 sym(vp9_plane_add_noise_mmx):
   470     push        rbp
   471     mov         rbp, rsp
   472     SHADOW_ARGS_TO_STACK 8
   473     GET_GOT     rbx
   474     push        rsi
   475     push        rdi
   476     ; end prolog
   478 .addnoise_loop:
   479     call sym(rand) WRT_PLT
   480     mov     rcx, arg(1) ;noise
   481     and     rax, 0xff
   482     add     rcx, rax
   484     ; we rely on the fact that the clamping vectors are stored contiguously
   485     ; in black/white/both order. Note that we have to reload this here because
   486     ; rdx could be trashed by rand()
   487     mov     rdx, arg(2) ; blackclamp
   490             mov     rdi, rcx
   491             movsxd  rcx, dword arg(5) ;[Width]
   492             mov     rsi, arg(0) ;Pos
   493             xor         rax,rax
   495 .addnoise_nextset:
   496             movq        mm1,[rsi+rax]         ; get the source
   498             psubusb     mm1, [rdx]    ;blackclamp        ; clamp both sides so we don't outrange adding noise
   499             paddusb     mm1, [rdx+32] ;bothclamp
   500             psubusb     mm1, [rdx+16] ;whiteclamp
   502             movq        mm2,[rdi+rax]         ; get the noise for this line
   503             paddb       mm1,mm2              ; add it in
   504             movq        [rsi+rax],mm1         ; store the result
   506             add         rax,8                 ; move to the next line
   508             cmp         rax, rcx
   509             jl          .addnoise_nextset
   511     movsxd  rax, dword arg(7) ; Pitch
   512     add     arg(0), rax ; Start += Pitch
   513     sub     dword arg(6), 1   ; Height -= 1
   514     jg      .addnoise_loop
   516     ; begin epilog
   517     pop rdi
   518     pop rsi
   519     RESTORE_GOT
   520     UNSHADOW_ARGS
   521     pop         rbp
   522     ret
   525 SECTION_RODATA
   526 align 16
   527 Blur:
   528     times 16 dw 16
   529     times  8 dw 64
   530     times 16 dw 16
   531     times  8 dw  0
   533 rd:
   534     times 4 dw 0x40

mercurial