media/libvpx/vp9/encoder/x86/vp9_subtract_sse2.asm

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned from the upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1,
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f, for hacking purposes.

michael@0 1 ;
michael@0 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
michael@0 3 ;
michael@0 4 ; Use of this source code is governed by a BSD-style license
michael@0 5 ; that can be found in the LICENSE file in the root of the source
michael@0 6 ; tree. An additional intellectual property rights grant can be found
michael@0 7 ; in the file PATENTS. All contributing project authors may
michael@0 8 ; be found in the AUTHORS file in the root of the source tree.
michael@0 9 ;
michael@0 10
michael@0 11 %include "third_party/x86inc/x86inc.asm"
michael@0 12
michael@0 13 SECTION .text
michael@0 14
;-----------------------------------------------------------------------------
; void vp9_subtract_block(int rows, int cols,
;                         int16_t *diff, ptrdiff_t diff_stride,
;                         const uint8_t *src, ptrdiff_t src_stride,
;                         const uint8_t *pred, ptrdiff_t pred_stride)
;
; Computes diff[r][c] = (int16_t)src[r][c] - (int16_t)pred[r][c] for a block
; of rows x cols pixels. Both 8-bit inputs are zero-extended to 16 bits
; before the subtraction.
;
; Strides:
;   src_stride and pred_stride are added to the byte pointers unscaled.
;   diff_stride is scaled by 2 when advancing diffq, i.e. it counts
;   int16_t elements.
;
; Width dispatch:
;   cols is compared against 4, 8, 16 and 32. Any other value falls through
;   to the 64-wide loop (presumably callers only pass 4/8/16/32/64 -
;   NOTE(review): confirm against callers).
;
; Row handling:
;   Every loop is bottom-tested, so at least one iteration always runs.
;   The 64- and 32-wide loops consume one row per iteration; the 16-, 8-
;   and 4-wide loops consume two rows per iteration, so they expect an even
;   row count (an odd count would process one extra row).
;
; Alignment:
;   The 16-, 32- and 64-wide paths use mova for every src/pred load and
;   every diff store, and the 8-wide path uses mova for its diff stores.
;   Under INIT_XMM x86inc maps mova to an aligned 16-byte move, so those
;   addresses must be 16-byte aligned.
;
; Registers:
;   m7       - all-zero, used as the high half when widening bytes to words.
;   pred_str - alias for colsq. cols is dead once the width has been
;              dispatched, so each path reloads that register with
;              pred_stride (the 8th argument, read through pred_stridemp
;              because only 7 arguments are loaded into registers).
;-----------------------------------------------------------------------------

INIT_XMM sse2
cglobal subtract_block, 7, 7, 8, \
                        rows, cols, diff, diff_stride, src, src_stride, \
                        pred, pred_stride
%define pred_str colsq
  pxor                  m7, m7         ; dedicated zero register
  cmp                colsd, 4
  je .case_4
  cmp                colsd, 8
  je .case_8
  cmp                colsd, 16
  je .case_16
  cmp                colsd, 32
  je .case_32
  ; anything else is handled by the 64-wide loop below

; loop16: subtract two 16-pixel groups.
;   %1, %2 - offsets from srcq  of the first / second group
;   %3, %4 - offsets from predq of the first / second group
;   %5, %6 - byte offsets from diffq where the first / second group's
;            16 words (two registers each) are stored
; Clobbers m0-m5; expects m7 == 0.
%macro loop16 6
  mova                  m0, [srcq+%1]
  mova                  m4, [srcq+%2]
  mova                  m1, [predq+%3]
  mova                  m5, [predq+%4]
  punpckhbw             m2, m0, m7     ; high 8 src bytes  -> words
  punpckhbw             m3, m1, m7     ; high 8 pred bytes -> words
  punpcklbw             m0, m7         ; low 8 src bytes   -> words
  punpcklbw             m1, m7         ; low 8 pred bytes  -> words
  psubw                 m2, m3
  psubw                 m0, m1
  punpckhbw             m1, m4, m7
  punpckhbw             m3, m5, m7
  punpcklbw             m4, m7
  punpcklbw             m5, m7
  psubw                 m1, m3
  psubw                 m4, m5
  mova [diffq+mmsize*0+%5], m0
  mova [diffq+mmsize*1+%5], m2
  mova [diffq+mmsize*0+%6], m4
  mova [diffq+mmsize*1+%6], m1
%endmacro

  ; ---- 64 pixels per row, one row per iteration ----
  mov             pred_str, pred_stridemp
.loop_64:
  loop16 0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize, 0*mmsize, 2*mmsize
  loop16 2*mmsize, 3*mmsize, 2*mmsize, 3*mmsize, 4*mmsize, 6*mmsize
  lea                diffq, [diffq+diff_strideq*2]
  add                predq, pred_str
  add                 srcq, src_strideq
  dec                rowsd
  jg .loop_64
  RET

  ; ---- 32 pixels per row, one row per iteration ----
.case_32:
  mov             pred_str, pred_stridemp
.loop_32:
  loop16 0, mmsize, 0, mmsize, 0, 2*mmsize
  lea                diffq, [diffq+diff_strideq*2]
  add                predq, pred_str
  add                 srcq, src_strideq
  dec                rowsd
  jg .loop_32
  RET

  ; ---- 16 pixels per row, two rows per iteration ----
  ; The "second group" of loop16 is the next row, reached through the strides.
.case_16:
  mov             pred_str, pred_stridemp
.loop_16:
  loop16 0, src_strideq, 0, pred_str, 0, diff_strideq*2
  lea                diffq, [diffq+diff_strideq*4]
  lea                predq, [predq+pred_str*2]
  lea                 srcq, [srcq+src_strideq*2]
  sub                rowsd, 2
  jg .loop_16
  RET

; loop_h: subtract two rows using half-register (movh) loads and full-register
; (mova) stores. Clobbers m0-m3; expects m7 == 0.
%macro loop_h 0
  movh                  m0, [srcq]
  movh                  m2, [srcq+src_strideq]
  movh                  m1, [predq]
  movh                  m3, [predq+pred_str]
  punpcklbw             m0, m7
  punpcklbw             m1, m7
  punpcklbw             m2, m7
  punpcklbw             m3, m7
  psubw                 m0, m1
  psubw                 m2, m3
  mova             [diffq], m0
  mova [diffq+diff_strideq*2], m2
%endmacro

  ; ---- 8 pixels per row, two rows per iteration ----
.case_8:
  mov             pred_str, pred_stridemp
.loop_8:
  loop_h
  lea                diffq, [diffq+diff_strideq*4]
  lea                 srcq, [srcq+src_strideq*2]
  lea                predq, [predq+pred_str*2]
  sub                rowsd, 2
  jg .loop_8
  RET

  ; ---- 4 pixels per row, two rows per iteration ----
  ; This path deliberately stays in XMM mode. Switching to INIT_MMX here would
  ; make m7 refer to mm7, which is never zeroed (the pxor at entry clears
  ; xmm7), corrupting the widened values, and would also require an emms
  ; before returning. Instead: load 4 bytes with movd, widen against the
  ; zeroed xmm7, and store only the low 8 bytes (4 words) with movh.
.case_4:
  mov             pred_str, pred_stridemp
.loop_4:
  movd                  m0, [srcq]
  movd                  m2, [srcq+src_strideq]
  movd                  m1, [predq]
  movd                  m3, [predq+pred_str]
  punpcklbw             m0, m7
  punpcklbw             m1, m7
  punpcklbw             m2, m7
  punpcklbw             m3, m7
  psubw                 m0, m1
  psubw                 m2, m3
  movh             [diffq], m0
  movh [diffq+diff_strideq*2], m2
  lea                diffq, [diffq+diff_strideq*4]
  lea                 srcq, [srcq+src_strideq*2]
  lea                predq, [predq+pred_str*2]
  sub                rowsd, 2
  jg .loop_4
  RET

mercurial