;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

SECTION .text

; convolve_fn: emit a block copy (copy) or copy-with-destination-averaging
; (avg) routine for block widths 4/8/16/32/64. The filter arguments are
; accepted for prototype compatibility but never used.
%macro convolve_fn 1
INIT_XMM sse2
cglobal convolve_%1, 4, 7, 4, src, src_stride, dst, dst_stride, \
                              fx, fxs, fy, fys, w, h
  ; dispatch on block width; widths other than 4/8/16/32 (i.e. 64) fall through
  mov r4d, dword wm
  cmp r4d, 4
  je .w4
  cmp r4d, 8
  je .w8
  cmp r4d, 16
  je .w16
  cmp r4d, 32
  je .w32

  ; 64-wide blocks: one row per iteration, four 16-byte loads/stores
  mov r4d, dword hm
.loop64:
  movu m0, [srcq]
  movu m1, [srcq+16]
  movu m2, [srcq+32]
  movu m3, [srcq+48]
  add srcq, src_strideq
%ifidn %1, avg
  ; avg variant: average the source row with what is already in dst
  pavgb m0, [dstq]
  pavgb m1, [dstq+16]
  pavgb m2, [dstq+32]
  pavgb m3, [dstq+48]
%endif
  mova [dstq   ], m0
  mova [dstq+16], m1
  mova [dstq+32], m2
  mova [dstq+48], m3
  add dstq, dst_strideq
  dec r4d
  jnz .loop64
  RET

.w32:
  ; 32-wide blocks: two rows per iteration
  mov r4d, dword hm
.loop32:
  movu m0, [srcq]
  movu m1, [srcq+16]
  movu m2, [srcq+src_strideq]
  movu m3, [srcq+src_strideq+16]
  lea srcq, [srcq+src_strideq*2]
%ifidn %1, avg
  pavgb m0, [dstq]
  pavgb m1, [dstq            +16]
  pavgb m2, [dstq+dst_strideq]
  pavgb m3, [dstq+dst_strideq+16]
%endif
  mova [dstq               ], m0
  mova [dstq            +16], m1
  mova [dstq+dst_strideq   ], m2
  mova [dstq+dst_strideq+16], m3
  lea dstq, [dstq+dst_strideq*2]
  sub r4d, 2
  jnz .loop32
  RET

.w16:
  ; 16-wide blocks: four rows per iteration, r5q/r6q hold 3*stride
  mov r4d, dword hm
  lea r5q, [src_strideq*3]
  lea r6q, [dst_strideq*3]
.loop16:
  movu m0, [srcq]
  movu m1, [srcq+src_strideq]
  movu m2, [srcq+src_strideq*2]
  movu m3, [srcq+r5q]
  lea srcq, [srcq+src_strideq*4]
%ifidn %1, avg
  pavgb m0, [dstq]
  pavgb m1, [dstq+dst_strideq]
  pavgb m2, [dstq+dst_strideq*2]
  pavgb m3, [dstq+r6q]
%endif
  mova [dstq              ], m0
  mova [dstq+dst_strideq  ], m1
  mova [dstq+dst_strideq*2], m2
  mova [dstq+r6q          ], m3
  lea dstq, [dstq+dst_strideq*4]
  sub r4d, 4
  jnz .loop16
  RET

; the remaining block sizes only need mmx-sized (8-byte) registers
INIT_MMX sse
.w8:
  ; 8-wide blocks: four rows per iteration
  mov r4d, dword hm
  lea r5q, [src_strideq*3]
  lea r6q, [dst_strideq*3]
.loop8:
  movu m0, [srcq]
  movu m1, [srcq+src_strideq]
  movu m2, [srcq+src_strideq*2]
  movu m3, [srcq+r5q]
  lea srcq, [srcq+src_strideq*4]
%ifidn %1, avg
  pavgb m0, [dstq]
  pavgb m1, [dstq+dst_strideq]
  pavgb m2, [dstq+dst_strideq*2]
  pavgb m3, [dstq+r6q]
%endif
  mova [dstq              ], m0
  mova [dstq+dst_strideq  ], m1
  mova [dstq+dst_strideq*2], m2
  mova [dstq+r6q          ], m3
  lea dstq, [dstq+dst_strideq*4]
  sub r4d, 4
  jnz .loop8
  RET

.w4:
  ; 4-wide blocks: four rows per iteration
  mov r4d, dword hm
  lea r5q, [src_strideq*3]
  lea r6q, [dst_strideq*3]
.loop4:
  movh m0, [srcq]
  movh m1, [srcq+src_strideq]
  movh m2, [srcq+src_strideq*2]
  movh m3, [srcq+r5q]
  lea srcq, [srcq+src_strideq*4]
%ifidn %1, avg
  pavgb m0, [dstq]
  pavgb m1, [dstq+dst_strideq]
  pavgb m2, [dstq+dst_strideq*2]
  pavgb m3, [dstq+r6q]
%endif
  movh [dstq              ], m0
  movh [dstq+dst_strideq  ], m1
  movh [dstq+dst_strideq*2], m2
  movh [dstq+r6q          ], m3
  lea dstq, [dstq+dst_strideq*4]
  sub r4d, 4
  jnz .loop4
  RET
%endmacro

convolve_fn copy
convolve_fn avg
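
; A minimal sketch of the C-level prototype these entry points are assumed to
; match (argument names are taken from the cglobal line above; the exact
; symbol prefix and argument types depend on how cglobal and the calling code
; are configured in this tree, so treat the types as an assumption):
;
;   void convolve_copy(const uint8_t *src, ptrdiff_t src_stride,
;                      uint8_t *dst, ptrdiff_t dst_stride,
;                      const int16_t *fx, int fxs,
;                      const int16_t *fy, int fys,
;                      int w, int h);
;
; The avg variant takes the same arguments; it averages each copied byte with
; the byte already in dst (the pavgb paths above) instead of overwriting it.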