Thu, 22 Jan 2015 13:21:57 +0100
Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6
1 /*
2 * Copyright 2012 The LibYuv Project Authors. All rights reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
11 #include "libyuv/basic_types.h"
12 #include "libyuv/row.h"
14 #ifdef __cplusplus
15 namespace libyuv {
16 extern "C" {
17 #endif
19 #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
21 __declspec(naked) __declspec(align(16))
22 uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
23 __asm {
24 mov eax, [esp + 4] // src_a
25 mov edx, [esp + 8] // src_b
26 mov ecx, [esp + 12] // count
27 pxor xmm0, xmm0
28 pxor xmm5, xmm5
30 align 4
31 wloop:
32 movdqa xmm1, [eax]
33 lea eax, [eax + 16]
34 movdqa xmm2, [edx]
35 lea edx, [edx + 16]
36 sub ecx, 16
37 movdqa xmm3, xmm1 // abs trick
38 psubusb xmm1, xmm2
39 psubusb xmm2, xmm3
40 por xmm1, xmm2
41 movdqa xmm2, xmm1
42 punpcklbw xmm1, xmm5
43 punpckhbw xmm2, xmm5
44 pmaddwd xmm1, xmm1
45 pmaddwd xmm2, xmm2
46 paddd xmm0, xmm1
47 paddd xmm0, xmm2
48 jg wloop
50 pshufd xmm1, xmm0, 0xee
51 paddd xmm0, xmm1
52 pshufd xmm1, xmm0, 0x01
53 paddd xmm0, xmm1
54 movd eax, xmm0
55 ret
56 }
57 }
59 // Visual C 2012 required for AVX2.
60 #if _MSC_VER >= 1700
61 // C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX.
62 #pragma warning(disable: 4752)
63 __declspec(naked) __declspec(align(16))
64 uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
65 __asm {
66 mov eax, [esp + 4] // src_a
67 mov edx, [esp + 8] // src_b
68 mov ecx, [esp + 12] // count
69 vpxor ymm0, ymm0, ymm0 // sum
70 vpxor ymm5, ymm5, ymm5 // constant 0 for unpck
71 sub edx, eax
73 align 4
74 wloop:
75 vmovdqu ymm1, [eax]
76 vmovdqu ymm2, [eax + edx]
77 lea eax, [eax + 32]
78 sub ecx, 32
79 vpsubusb ymm3, ymm1, ymm2 // abs difference trick
80 vpsubusb ymm2, ymm2, ymm1
81 vpor ymm1, ymm2, ymm3
82 vpunpcklbw ymm2, ymm1, ymm5 // u16. mutates order.
83 vpunpckhbw ymm1, ymm1, ymm5
84 vpmaddwd ymm2, ymm2, ymm2 // square + hadd to u32.
85 vpmaddwd ymm1, ymm1, ymm1
86 vpaddd ymm0, ymm0, ymm1
87 vpaddd ymm0, ymm0, ymm2
88 jg wloop
90 vpshufd ymm1, ymm0, 0xee // 3, 2 + 1, 0 both lanes.
91 vpaddd ymm0, ymm0, ymm1
92 vpshufd ymm1, ymm0, 0x01 // 1 + 0 both lanes.
93 vpaddd ymm0, ymm0, ymm1
94 vpermq ymm1, ymm0, 0x02 // high + low lane.
95 vpaddd ymm0, ymm0, ymm1
96 vmovd eax, xmm0
97 vzeroupper
98 ret
99 }
100 }
101 #endif // _MSC_VER >= 1700
103 #define HAS_HASHDJB2_SSE41
104 static uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16
105 static uvec32 kHashMul0 = {
106 0x0c3525e1, // 33 ^ 15
107 0xa3476dc1, // 33 ^ 14
108 0x3b4039a1, // 33 ^ 13
109 0x4f5f0981, // 33 ^ 12
110 };
111 static uvec32 kHashMul1 = {
112 0x30f35d61, // 33 ^ 11
113 0x855cb541, // 33 ^ 10
114 0x040a9121, // 33 ^ 9
115 0x747c7101, // 33 ^ 8
116 };
117 static uvec32 kHashMul2 = {
118 0xec41d4e1, // 33 ^ 7
119 0x4cfa3cc1, // 33 ^ 6
120 0x025528a1, // 33 ^ 5
121 0x00121881, // 33 ^ 4
122 };
123 static uvec32 kHashMul3 = {
124 0x00008c61, // 33 ^ 3
125 0x00000441, // 33 ^ 2
126 0x00000021, // 33 ^ 1
127 0x00000001, // 33 ^ 0
128 };
// Emits the SSE4.1 instruction "pmulld xmmA, xmmB" as raw bytes:
//   66 0F 38 40 <modrm>
// The argument is the final ModRM byte, which selects the two registers.
// Encodings used in this file:
//   0xC6  pmulld xmm0, xmm6
//   0xDD  pmulld xmm3, xmm5
//   0xE5  pmulld xmm4, xmm5
//   0xD5  pmulld xmm2, xmm5
//   0xCD  pmulld xmm1, xmm5
// NOTE(review): presumably hand-encoded so that toolchains whose inline
// assembler lacks the pmulld mnemonic can still build - confirm.
#define pmulld(modrm) \
    _asm _emit 0x66   \
    _asm _emit 0x0F   \
    _asm _emit 0x38   \
    _asm _emit 0x40   \
    _asm _emit modrm
138 __declspec(naked) __declspec(align(16))
139 uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
140 __asm {
141 mov eax, [esp + 4] // src
142 mov ecx, [esp + 8] // count
143 movd xmm0, [esp + 12] // seed
145 pxor xmm7, xmm7 // constant 0 for unpck
146 movdqa xmm6, kHash16x33
148 align 4
149 wloop:
150 movdqu xmm1, [eax] // src[0-15]
151 lea eax, [eax + 16]
152 pmulld(0xc6) // pmulld xmm0,xmm6 hash *= 33 ^ 16
153 movdqa xmm5, kHashMul0
154 movdqa xmm2, xmm1
155 punpcklbw xmm2, xmm7 // src[0-7]
156 movdqa xmm3, xmm2
157 punpcklwd xmm3, xmm7 // src[0-3]
158 pmulld(0xdd) // pmulld xmm3, xmm5
159 movdqa xmm5, kHashMul1
160 movdqa xmm4, xmm2
161 punpckhwd xmm4, xmm7 // src[4-7]
162 pmulld(0xe5) // pmulld xmm4, xmm5
163 movdqa xmm5, kHashMul2
164 punpckhbw xmm1, xmm7 // src[8-15]
165 movdqa xmm2, xmm1
166 punpcklwd xmm2, xmm7 // src[8-11]
167 pmulld(0xd5) // pmulld xmm2, xmm5
168 movdqa xmm5, kHashMul3
169 punpckhwd xmm1, xmm7 // src[12-15]
170 pmulld(0xcd) // pmulld xmm1, xmm5
171 paddd xmm3, xmm4 // add 16 results
172 paddd xmm1, xmm2
173 sub ecx, 16
174 paddd xmm1, xmm3
176 pshufd xmm2, xmm1, 0x0e // upper 2 dwords
177 paddd xmm1, xmm2
178 pshufd xmm2, xmm1, 0x01
179 paddd xmm1, xmm2
180 paddd xmm0, xmm1
181 jg wloop
183 movd eax, xmm0 // return hash
184 ret
185 }
186 }
188 // Visual C 2012 required for AVX2.
189 #if _MSC_VER >= 1700
190 __declspec(naked) __declspec(align(16))
191 uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) {
192 __asm {
193 mov eax, [esp + 4] // src
194 mov ecx, [esp + 8] // count
195 movd xmm0, [esp + 12] // seed
196 movdqa xmm6, kHash16x33
198 align 4
199 wloop:
200 vpmovzxbd xmm3, dword ptr [eax] // src[0-3]
201 pmulld xmm0, xmm6 // hash *= 33 ^ 16
202 vpmovzxbd xmm4, dword ptr [eax + 4] // src[4-7]
203 pmulld xmm3, kHashMul0
204 vpmovzxbd xmm2, dword ptr [eax + 8] // src[8-11]
205 pmulld xmm4, kHashMul1
206 vpmovzxbd xmm1, dword ptr [eax + 12] // src[12-15]
207 pmulld xmm2, kHashMul2
208 lea eax, [eax + 16]
209 pmulld xmm1, kHashMul3
210 paddd xmm3, xmm4 // add 16 results
211 paddd xmm1, xmm2
212 sub ecx, 16
213 paddd xmm1, xmm3
214 pshufd xmm2, xmm1, 0x0e // upper 2 dwords
215 paddd xmm1, xmm2
216 pshufd xmm2, xmm1, 0x01
217 paddd xmm1, xmm2
218 paddd xmm0, xmm1
219 jg wloop
221 movd eax, xmm0 // return hash
222 ret
223 }
224 }
225 #endif // _MSC_VER >= 1700
227 #endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
229 #ifdef __cplusplus
230 } // extern "C"
231 } // namespace libyuv
232 #endif