Thu, 22 Jan 2015 13:21:57 +0100
Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6
1 ;
2 ; jcsammmx.asm - downsampling (MMX)
3 ;
4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
5 ;
6 ; Based on
7 ; x86 SIMD extension for IJG JPEG library
8 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
10 ;
11 ; This file should be assembled with NASM (Netwide Assembler); it
12 ; can *not* be assembled with Microsoft's MASM or any compatible
13 ; assembler (including Borland's Turbo Assembler).
14 ; NASM is available from http://nasm.sourceforge.net/ or
15 ; http://sourceforge.net/project/showfiles.php?group_id=6208
16 ;
17 ; [TAB8]
19 %include "jsimdext.inc"
21 ; --------------------------------------------------------------------------
22 SECTION SEG_TEXT
23 BITS 32
24 ;
25 ; Downsample pixel values of a single component.
26 ; This version handles the common case of 2:1 horizontal and 1:1 vertical,
27 ; without smoothing.
28 ;
; Each output sample is (in[2*i] + in[2*i+1] + bias) >> 1, where the bias
; alternates 0, 1, 0, 1, ... across output columns.  Before downsampling,
; the first max_v_samp_factor input rows are padded in place on the right
; (by repeating each row's last sample) out to width_blocks * 8 * 2 samples
; whenever that is wider than image_width.
;
; Arguments are read from the stack relative to EBP.  EBP, ESI and EDI are
; saved and restored; EAX, ECX, EDX, MM0-MM3, MM6 and MM7 are overwritten.
; NOTE(review): the padding writes into the input rows, so each row buffer
; must have room for output_cols * 2 samples -- confirm against the caller.
;
29 ; GLOBAL(void)
30 ; jsimd_h2v1_downsample_mmx (JDIMENSION image_width, int max_v_samp_factor,
31 ; JDIMENSION v_samp_factor, JDIMENSION width_blocks,
32 ; JSAMPARRAY input_data, JSAMPARRAY output_data);
33 ;
; Argument offsets from the frame pointer "b": after "push ebp / mov ebp,esp"
; the saved EBP is at (b)+0 and the return address at (b)+4, so the first
; argument starts at (b)+8.
35 %define img_width(b) (b)+8 ; JDIMENSION image_width
36 %define max_v_samp(b) (b)+12 ; int max_v_samp_factor
37 %define v_samp(b) (b)+16 ; JDIMENSION v_samp_factor
38 %define width_blks(b) (b)+20 ; JDIMENSION width_blocks
39 %define input_data(b) (b)+24 ; JSAMPARRAY input_data
40 %define output_data(b) (b)+28 ; JSAMPARRAY output_data
42 align 16
43 global EXTN(jsimd_h2v1_downsample_mmx)
45 EXTN(jsimd_h2v1_downsample_mmx):
46 push ebp ; build an EBP frame so the arguments sit at fixed offsets
47 mov ebp,esp
48 ; push ebx ; unused
49 ; push ecx ; need not be preserved
50 ; push edx ; need not be preserved
51 push esi ; ESI and EDI are used below and restored at .return
52 push edi
54 mov ecx, JDIMENSION [width_blks(ebp)] ; ecx = width_blocks
55 shl ecx,3 ; imul ecx,DCTSIZE (ecx = output_cols)
56 jz near .return ; zero output columns: nothing to do
58 mov edx, JDIMENSION [img_width(ebp)] ; edx = image_width
60 ; -- expand_right_edge: pad each input row from image_width up to output_cols*2 samples
62 push ecx ; keep output_cols for the downsample pass (popped at .expand_end)
63 shl ecx,1 ; output_cols * 2
64 sub ecx,edx ; ecx = number of padding samples needed per row
65 jle short .expand_end ; rows are already wide enough (signed compare)
67 mov eax, INT [max_v_samp(ebp)] ; eax = number of input rows to pad
68 test eax,eax
69 jle short .expand_end ; no rows to pad
71 cld ; clear DF so that stosb advances EDI upward
72 mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
73 alignx 16,7 ; alignx is not defined in this file; presumably aligns the loop head -- confirm in jsimdext.inc
74 .expandloop:
75 push eax ; AL is overwritten with the fill value below; save the row counter
76 push ecx ; rep stosb counts ECX down to zero; save the padding count
78 mov edi, JSAMPROW [esi] ; edi = start of the current input row
79 add edi,edx ; edi = first position past the last real sample
80 mov al, JSAMPLE [edi-1] ; al = last real sample of the row
82 rep stosb ; append ecx copies of that sample to the row
84 pop ecx ; restore padding count
85 pop eax ; restore row counter
87 add esi, byte SIZEOF_JSAMPROW ; advance to the next row pointer
88 dec eax
89 jg short .expandloop ; loop while rows remain
91 .expand_end:
92 pop ecx ; output_cols
94 ; -- h2v1_downsample
96 mov eax, JDIMENSION [v_samp(ebp)] ; rowctr
97 test eax,eax
98 jle near .return ; no rows to process; no MMX register has been touched yet, so skipping emms is fine
100 mov edx, 0x00010000 ; bias pattern
101 movd mm7,edx ; low dword of mm7 = words {0, 1}
102 pcmpeqw mm6,mm6 ; mm6 = all ones
103 punpckldq mm7,mm7 ; mm7={0, 1, 0, 1}
104 psrlw mm6,BYTE_BIT ; mm6={0xFF 0x00 0xFF 0x00 ..}
106 mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
107 mov edi, JSAMPARRAY [output_data(ebp)] ; output_data
108 alignx 16,7
109 .rowloop:
110 push ecx ; the column loop consumes ecx/edi/esi; save the per-row state
111 push edi
112 push esi
114 mov esi, JSAMPROW [esi] ; inptr
115 mov edi, JSAMPROW [edi] ; outptr
116 alignx 16,7
117 .columnloop:
119 movq mm0, MMWORD [esi+0*SIZEOF_MMWORD] ; load two consecutive MMWORDs of input samples
120 movq mm1, MMWORD [esi+1*SIZEOF_MMWORD]
121 movq mm2,mm0 ; keep copies for extracting the odd-indexed samples
122 movq mm3,mm1
124 pand mm0,mm6 ; mm0 = low byte of each word (even-indexed samples)
125 psrlw mm2,BYTE_BIT ; mm2 = high byte of each word (odd-indexed samples)
126 pand mm1,mm6 ; same split for the second MMWORD
127 psrlw mm3,BYTE_BIT
129 paddw mm0,mm2 ; sum each horizontal pair as a 16-bit word
130 paddw mm1,mm3
131 paddw mm0,mm7 ; add the alternating 0/1 bias
132 paddw mm1,mm7
133 psrlw mm0,1 ; divide by 2
134 psrlw mm1,1
136 packuswb mm0,mm1 ; pack the word results of mm0 and mm1 back into bytes
138 movq MMWORD [edi+0*SIZEOF_MMWORD], mm0 ; store one MMWORD of output samples
140 add esi, byte 2*SIZEOF_MMWORD ; inptr
141 add edi, byte 1*SIZEOF_MMWORD ; outptr
142 sub ecx, byte SIZEOF_MMWORD ; outcol -- reaches exactly zero because output_cols = width_blocks*8 (assumes SIZEOF_MMWORD is 8 -- confirm in jsimdext.inc)
143 jnz short .columnloop
145 pop esi ; restore the row-pointer cursors and output_cols
146 pop edi
147 pop ecx
149 add esi, byte SIZEOF_JSAMPROW ; input_data
150 add edi, byte SIZEOF_JSAMPROW ; output_data
151 dec eax ; rowctr
152 jg short .rowloop
154 emms ; empty MMX state
156 .return:
157 pop edi ; undo the prologue pushes in reverse order
158 pop esi
159 ; pop edx ; need not be preserved
160 ; pop ecx ; need not be preserved
161 ; pop ebx ; unused
162 pop ebp
163 ret
165 ; --------------------------------------------------------------------------
166 ;
167 ; Downsample pixel values of a single component.
168 ; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
169 ; without smoothing.
170 ;
; Each output sample is the sum of a 2x2 block of input samples (two
; horizontally adjacent samples from each of two consecutive input rows)
; plus a bias, shifted right by 2.  The bias alternates 1, 2, 1, 2, ...
; across output columns.  Every output row consumes two input rows.
; Before downsampling, the first max_v_samp_factor input rows are padded in
; place on the right (by repeating each row's last sample) out to
; width_blocks * 8 * 2 samples whenever that is wider than image_width.
;
; Arguments are read from the stack relative to EBP.  EBP, ESI and EDI are
; saved and restored; EAX, ECX, EDX and MM0-MM7 are overwritten.
; NOTE(review): the padding writes into the input rows, so each row buffer
; must have room for output_cols * 2 samples -- confirm against the caller.
;
171 ; GLOBAL(void)
172 ; jsimd_h2v2_downsample_mmx (JDIMENSION image_width, int max_v_samp_factor,
173 ; JDIMENSION v_samp_factor, JDIMENSION width_blocks,
174 ; JSAMPARRAY input_data, JSAMPARRAY output_data);
175 ;
; Argument offsets from the frame pointer "b": after "push ebp / mov ebp,esp"
; the saved EBP is at (b)+0 and the return address at (b)+4, so the first
; argument starts at (b)+8.
177 %define img_width(b) (b)+8 ; JDIMENSION image_width
178 %define max_v_samp(b) (b)+12 ; int max_v_samp_factor
179 %define v_samp(b) (b)+16 ; JDIMENSION v_samp_factor
180 %define width_blks(b) (b)+20 ; JDIMENSION width_blocks
181 %define input_data(b) (b)+24 ; JSAMPARRAY input_data
182 %define output_data(b) (b)+28 ; JSAMPARRAY output_data
184 align 16
185 global EXTN(jsimd_h2v2_downsample_mmx)
187 EXTN(jsimd_h2v2_downsample_mmx):
188 push ebp ; build an EBP frame so the arguments sit at fixed offsets
189 mov ebp,esp
190 ; push ebx ; unused
191 ; push ecx ; need not be preserved
192 ; push edx ; need not be preserved
193 push esi ; ESI and EDI are used below and restored at .return
194 push edi
196 mov ecx, JDIMENSION [width_blks(ebp)] ; ecx = width_blocks
197 shl ecx,3 ; imul ecx,DCTSIZE (ecx = output_cols)
198 jz near .return ; zero output columns: nothing to do
200 mov edx, JDIMENSION [img_width(ebp)] ; edx = image_width
202 ; -- expand_right_edge: pad each input row from image_width up to output_cols*2 samples
204 push ecx ; keep output_cols for the downsample pass (popped at .expand_end)
205 shl ecx,1 ; output_cols * 2
206 sub ecx,edx ; ecx = number of padding samples needed per row
207 jle short .expand_end ; rows are already wide enough (signed compare)
209 mov eax, INT [max_v_samp(ebp)] ; eax = number of input rows to pad
210 test eax,eax
211 jle short .expand_end ; no rows to pad
213 cld ; clear DF so that stosb advances EDI upward
214 mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
215 alignx 16,7 ; alignx is not defined in this file; presumably aligns the loop head -- confirm in jsimdext.inc
216 .expandloop:
217 push eax ; AL is overwritten with the fill value below; save the row counter
218 push ecx ; rep stosb counts ECX down to zero; save the padding count
220 mov edi, JSAMPROW [esi] ; edi = start of the current input row
221 add edi,edx ; edi = first position past the last real sample
222 mov al, JSAMPLE [edi-1] ; al = last real sample of the row
224 rep stosb ; append ecx copies of that sample to the row
226 pop ecx ; restore padding count
227 pop eax ; restore row counter
229 add esi, byte SIZEOF_JSAMPROW ; advance to the next row pointer
230 dec eax
231 jg short .expandloop ; loop while rows remain
233 .expand_end:
234 pop ecx ; output_cols
236 ; -- h2v2_downsample
238 mov eax, JDIMENSION [v_samp(ebp)] ; rowctr
239 test eax,eax
240 jle near .return ; no rows to process; no MMX register has been touched yet, so skipping emms is fine
242 mov edx, 0x00020001 ; bias pattern (image_width in edx is no longer needed)
243 movd mm7,edx ; low dword of mm7 = words {1, 2}
244 pcmpeqw mm6,mm6 ; mm6 = all ones
245 punpckldq mm7,mm7 ; mm7={1, 2, 1, 2}
246 psrlw mm6,BYTE_BIT ; mm6={0xFF 0x00 0xFF 0x00 ..}
248 mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
249 mov edi, JSAMPARRAY [output_data(ebp)] ; output_data
250 alignx 16,7
251 .rowloop:
252 push ecx ; the column loop consumes ecx/edi/esi; save the per-row state
253 push edi
254 push esi
256 mov edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0
257 mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1
258 mov edi, JSAMPROW [edi] ; outptr
259 alignx 16,7
260 .columnloop:
262 movq mm0, MMWORD [edx+0*SIZEOF_MMWORD] ; first MMWORD of row 0
263 movq mm1, MMWORD [esi+0*SIZEOF_MMWORD] ; first MMWORD of row 1
264 movq mm2, MMWORD [edx+1*SIZEOF_MMWORD] ; second MMWORD of row 0
265 movq mm3, MMWORD [esi+1*SIZEOF_MMWORD] ; second MMWORD of row 1
267 movq mm4,mm0 ; copies for extracting the odd-indexed samples
268 movq mm5,mm1
269 pand mm0,mm6 ; low byte of each word (even-indexed samples), row 0
270 psrlw mm4,BYTE_BIT ; high byte of each word (odd-indexed samples), row 0
271 pand mm1,mm6 ; same split for row 1
272 psrlw mm5,BYTE_BIT
273 paddw mm0,mm4 ; mm0 = horizontal pair sums, row 0, first MMWORD
274 paddw mm1,mm5 ; mm1 = horizontal pair sums, row 1, first MMWORD
276 movq mm4,mm2 ; repeat the split-and-sum for the second MMWORD of each row
277 movq mm5,mm3
278 pand mm2,mm6
279 psrlw mm4,BYTE_BIT
280 pand mm3,mm6
281 psrlw mm5,BYTE_BIT
282 paddw mm2,mm4 ; mm2 = horizontal pair sums, row 0, second MMWORD
283 paddw mm3,mm5 ; mm3 = horizontal pair sums, row 1, second MMWORD
285 paddw mm0,mm1 ; add the two rows: each word is now a 2x2 block sum
286 paddw mm2,mm3
287 paddw mm0,mm7 ; add the alternating 1/2 bias
288 paddw mm2,mm7
289 psrlw mm0,2 ; divide by 4
290 psrlw mm2,2
292 packuswb mm0,mm2 ; pack the word results of mm0 and mm2 back into bytes
294 movq MMWORD [edi+0*SIZEOF_MMWORD], mm0 ; store one MMWORD of output samples
296 add edx, byte 2*SIZEOF_MMWORD ; inptr0
297 add esi, byte 2*SIZEOF_MMWORD ; inptr1
298 add edi, byte 1*SIZEOF_MMWORD ; outptr
299 sub ecx, byte SIZEOF_MMWORD ; outcol -- reaches exactly zero because output_cols = width_blocks*8 (assumes SIZEOF_MMWORD is 8 -- confirm in jsimdext.inc)
300 jnz near .columnloop
302 pop esi ; restore the row-pointer cursors and output_cols
303 pop edi
304 pop ecx
306 add esi, byte 2*SIZEOF_JSAMPROW ; input_data (two input rows per output row)
307 add edi, byte 1*SIZEOF_JSAMPROW ; output_data
308 dec eax ; rowctr
309 jg near .rowloop
311 emms ; empty MMX state
313 .return:
314 pop edi ; undo the prologue pushes in reverse order
315 pop esi
316 ; pop edx ; need not be preserved
317 ; pop ecx ; need not be preserved
318 ; pop ebx ; unused
319 pop ebp
320 ret
322 ; For some reason, the OS X linker does not honor the request to align the
323 ; segment unless we do this.
324 align 16