;
; jdsammmx.asm - upsampling (MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
;
; Based on
; x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]

%include "jsimdext.inc"

; --------------------------------------------------------------------------
        SECTION SEG_CONST

; Word constants used by the fancy (triangle-filter) upsamplers in this file.
; Each entry is four 16-bit words, i.e. one MMX register's worth, so it can
; be used directly as a memory operand of pmullw/paddw.
; NOTE(review): alignz is a macro from jsimdext.inc; presumably it aligns
; with zero fill -- confirm there.

        alignz  16
        global  EXTN(jconst_fancy_upsample_mmx)

EXTN(jconst_fancy_upsample_mmx):

PW_ONE          times 4 dw  1   ; h2v1 fancy: rounding term for even outputs
PW_TWO          times 4 dw  2   ; h2v1 fancy: rounding term for odd outputs
PW_THREE        times 4 dw  3   ; weight applied to the nearer sample
PW_SEVEN        times 4 dw  7   ; h2v2 fancy: rounding term for odd outputs
PW_EIGHT        times 4 dw  8   ; h2v2 fancy: rounding term for even outputs

        alignz  16

; --------------------------------------------------------------------------
        SECTION SEG_TEXT
        BITS    32
;
; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
;
; The upsampling algorithm is linear interpolation between pixel centers,
; also known as a "triangle filter".  This is a good compromise between
; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
; of the way between input pixel centers.
;
; GLOBAL(void)
; jsimd_h2v1_fancy_upsample_mmx (int max_v_samp_factor,
;                                JDIMENSION downsampled_width,
;                                JSAMPARRAY input_data,
;                                JSAMPARRAY * output_data_ptr);
;
; Arguments are read from the stack relative to ebp (see the %defines below).
; The function returns immediately if downsampled_width or
; max_v_samp_factor is zero.
;
; For every input sample s[i] two output samples are produced:
;       out[2*i]   = (3*s[i] + s[i-1] + 1) >> 2
;       out[2*i+1] = (3*s[i] + s[i+1] + 2) >> 2
; At the left edge s[-1] is taken to be s[0]; for the last column block the
; sample following the block is taken to be the block's own last sample.
;
; Register usage inside the loops:
;       eax = colctr (input samples left, rounded up to SIZEOF_MMWORD)
;       ecx = rowctr
;       esi = input_data, then inptr;  edi = output_data, then outptr
;       ebx = GOT address, consumed by GOTOFF to reach the PW_* constants
;       dl  = scratch for the dummy sample
;       mm0 = zero (for byte->word unpacking)
;       mm7 = left neighbour, carried from one column block to the next
;       mm6 = right neighbour of the current column block
;
; Side effects worth knowing:
;   - When downsampled_width is not a multiple of SIZEOF_MMWORD, one byte is
;     written into the *input* row just past its last sample.
;   - Rows are read and written in whole MMWORD units, so accesses extend to
;     the width rounded up to SIZEOF_MMWORD (twice that for the output row).
;   - eax, ecx, edx and mm0-mm7 are clobbered; esi, edi and ebp are restored,
;     and ebx is saved/restored through the pushpic/poppic macros.
;

%define max_v_samp(b)           (b)+8           ; int max_v_samp_factor
%define downsamp_width(b)       (b)+12          ; JDIMENSION downsampled_width
%define input_data(b)           (b)+16          ; JSAMPARRAY input_data
%define output_data_ptr(b)      (b)+20          ; JSAMPARRAY * output_data_ptr

        align   16
        global  EXTN(jsimd_h2v1_fancy_upsample_mmx)

EXTN(jsimd_h2v1_fancy_upsample_mmx):
        push    ebp
        mov     ebp,esp
        pushpic ebx
;       push    ecx             ; need not be preserved
;       push    edx             ; need not be preserved
        push    esi
        push    edi

        get_GOT ebx                     ; get GOT address

        mov     eax, JDIMENSION [downsamp_width(ebp)]   ; colctr
        test    eax,eax
        jz      near .return            ; no MMX register touched yet, so no emms needed

        mov     ecx, INT [max_v_samp(ebp)]              ; rowctr
        test    ecx,ecx
        jz      near .return

        mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
        mov     edi, POINTER [output_data_ptr(ebp)]
        mov     edi, JSAMPARRAY [edi]                   ; output_data
        alignx  16,7
.rowloop:
        push    eax                     ; colctr
        push    edi
        push    esi

        mov     esi, JSAMPROW [esi]     ; inptr
        mov     edi, JSAMPROW [edi]     ; outptr

        ; If the width is not a whole number of MMWORDs, replicate the last
        ; sample one position to the right so that the last real sample sees
        ; itself as its right-hand neighbour.
        test    eax, SIZEOF_MMWORD-1
        jz      short .skip
        mov     dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
        mov     JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
.skip:
        pxor    mm0,mm0                 ; mm0=(all 0's)

        ; mm7 = sample 0 of the row in the lowest byte, all other bytes zero:
        ; this is the "-1" neighbour used for the first column block.
        pcmpeqb mm7,mm7
        psrlq   mm7,(SIZEOF_MMWORD-1)*BYTE_BIT
        pand    mm7, MMWORD [esi+0*SIZEOF_MMWORD]

        ; Round colctr up to a whole number of MMWORDs.
        add     eax, byte SIZEOF_MMWORD-1
        and     eax, byte -SIZEOF_MMWORD
        cmp     eax, byte SIZEOF_MMWORD
        ja      short .columnloop
        alignx  16,7

.columnloop_last:
        ; Last column block: the "+8" neighbour is the block's own last
        ; sample, kept in the highest byte of mm6.
        pcmpeqb mm6,mm6
        psllq   mm6,(SIZEOF_MMWORD-1)*BYTE_BIT
        pand    mm6, MMWORD [esi+0*SIZEOF_MMWORD]
        jmp     short .upsample
        alignx  16,7

.columnloop:
        ; Not the last block: the "+8" neighbour is the first sample of the
        ; next column block, moved into the highest byte of mm6.
        movq    mm6, MMWORD [esi+1*SIZEOF_MMWORD]
        psllq   mm6,(SIZEOF_MMWORD-1)*BYTE_BIT

.upsample:
        movq    mm1, MMWORD [esi+0*SIZEOF_MMWORD]
        movq    mm2,mm1
        movq    mm3,mm1                 ; mm1=( 0 1 2 3 4 5 6 7)
        psllq   mm2,BYTE_BIT            ; mm2=( - 0 1 2 3 4 5 6)
        psrlq   mm3,BYTE_BIT            ; mm3=( 1 2 3 4 5 6 7 -)

        por     mm2,mm7                 ; mm2=(-1 0 1 2 3 4 5 6)
        por     mm3,mm6                 ; mm3=( 1 2 3 4 5 6 7 8)

        ; Save this block's last sample as the left neighbour of the next block.
        movq    mm7,mm1
        psrlq   mm7,(SIZEOF_MMWORD-1)*BYTE_BIT  ; mm7=( 7 - - - - - - -)

        ; Widen bytes to words so the weighted sums cannot overflow.
        movq      mm4,mm1
        punpcklbw mm1,mm0               ; mm1=( 0 1 2 3)
        punpckhbw mm4,mm0               ; mm4=( 4 5 6 7)
        movq      mm5,mm2
        punpcklbw mm2,mm0               ; mm2=(-1 0 1 2)
        punpckhbw mm5,mm0               ; mm5=( 3 4 5 6)
        movq      mm6,mm3
        punpcklbw mm3,mm0               ; mm3=( 1 2 3 4)
        punpckhbw mm6,mm0               ; mm6=( 5 6 7 8)

        pmullw  mm1,[GOTOFF(ebx,PW_THREE)]      ; 3 * centre sample
        pmullw  mm4,[GOTOFF(ebx,PW_THREE)]
        paddw   mm2,[GOTOFF(ebx,PW_ONE)]        ; left neighbour + 1
        paddw   mm5,[GOTOFF(ebx,PW_ONE)]
        paddw   mm3,[GOTOFF(ebx,PW_TWO)]        ; right neighbour + 2
        paddw   mm6,[GOTOFF(ebx,PW_TWO)]

        paddw   mm2,mm1
        paddw   mm5,mm4
        psrlw   mm2,2                   ; mm2=OutLE=( 0  2  4  6)
        psrlw   mm5,2                   ; mm5=OutHE=( 8 10 12 14)
        paddw   mm3,mm1
        paddw   mm6,mm4
        psrlw   mm3,2                   ; mm3=OutLO=( 1  3  5  7)
        psrlw   mm6,2                   ; mm6=OutHO=( 9 11 13 15)

        ; Interleave: even outputs stay in the low byte of each word, odd
        ; outputs are shifted into the high byte.
        psllw   mm3,BYTE_BIT
        psllw   mm6,BYTE_BIT
        por     mm2,mm3                 ; mm2=OutL=( 0  1  2  3  4  5  6  7)
        por     mm5,mm6                 ; mm5=OutH=( 8  9 10 11 12 13 14 15)

        movq    MMWORD [edi+0*SIZEOF_MMWORD], mm2
        movq    MMWORD [edi+1*SIZEOF_MMWORD], mm5

        sub     eax, byte SIZEOF_MMWORD
        add     esi, byte 1*SIZEOF_MMWORD       ; inptr
        add     edi, byte 2*SIZEOF_MMWORD       ; outptr
        cmp     eax, byte SIZEOF_MMWORD
        ja      near .columnloop                ; more than one block left
        test    eax,eax
        jnz     near .columnloop_last           ; exactly one block left

        pop     esi
        pop     edi
        pop     eax

        add     esi, byte SIZEOF_JSAMPROW       ; input_data
        add     edi, byte SIZEOF_JSAMPROW       ; output_data
        dec     ecx                             ; rowctr
        jg      near .rowloop

        emms                            ; empty MMX state

.return:
        pop     edi
        pop     esi
;       pop     edx             ; need not be preserved
;       pop     ecx             ; need not be preserved
        poppic  ebx
        pop     ebp
        ret

; --------------------------------------------------------------------------
;
; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
; Again a triangle filter; see comments for h2v1 case, above.
;
; GLOBAL(void)
; jsimd_h2v2_fancy_upsample_mmx (int max_v_samp_factor,
;                                JDIMENSION downsampled_width,
;                                JSAMPARRAY input_data,
;                                JSAMPARRAY * output_data_ptr);
;
; The function returns immediately if downsampled_width or
; max_v_samp_factor is zero.
;
; Each pass of .rowloop reads one input row together with the rows above and
; below it, and produces two output rows (rowctr goes down by 2, input_data
; advances by one row pointer, output_data by two).  The work is split in two:
;
;   vertical:   Int0[i] = 3*row[0][i] + row[-1][i]      (upper output row)
;               Int1[i] = 3*row[0][i] + row[+1][i]      (lower output row)
;               These are 16-bit words; they are parked temporarily in the
;               output rows themselves (8 words = the 16 output bytes that
;               the same column block will finally occupy).
;   horizontal: out[2*i]   = (3*Int[i] + Int[i-1] + 8) >> 4
;               out[2*i+1] = (3*Int[i] + Int[i+1] + 7) >> 4
;               At the left edge Int[-1] is Int[0]; for the last column block
;               the word following the block is the block's own last word.
;
; Stack frame: ebp is aligned down to SIZEOF_MMWORD so that the wk[] scratch
; slots below it are aligned.  [original_ebp] holds the conventional frame
; base (esp right after "push ebp"), from which the arguments are addressed.
;       wk(0) / wk(1) = left-neighbour word for the upper / lower row
;       wk(2) / wk(3) = right-neighbour word for the upper / lower row
;       gotptr        = slot just below wk(0) holding the GOT address
;
; Register usage inside the column loop:
;       eax = colctr (input samples left, rounded up to SIZEOF_MMWORD)
;       ecx = inptr1(above), ebx = inptr0, esi = inptr1(below)
;       edx = outptr0, edi = outptr1
;       ebx is temporarily swapped for the GOT address (pushpic/movpic ...
;       poppic) around every stretch of code that uses GOTOFF.
;
; Side effects worth knowing:
;   - When downsampled_width is not a multiple of SIZEOF_MMWORD, one byte is
;     written into each of the three *input* rows just past its last sample.
;   - Rows are accessed in whole MMWORD units (width rounded up).
;   - eax, ecx, edx and mm0-mm7 are clobbered; ebx, esi, edi, ebp are restored.
;

%define max_v_samp(b)           (b)+8           ; int max_v_samp_factor
%define downsamp_width(b)       (b)+12          ; JDIMENSION downsampled_width
%define input_data(b)           (b)+16          ; JSAMPARRAY input_data
%define output_data_ptr(b)      (b)+20          ; JSAMPARRAY * output_data_ptr

%define original_ebp    ebp+0
%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_MMWORD  ; mmword wk[WK_NUM]
%define WK_NUM          4
%define gotptr          wk(0)-SIZEOF_POINTER    ; void * gotptr

        align   16
        global  EXTN(jsimd_h2v2_fancy_upsample_mmx)

EXTN(jsimd_h2v2_fancy_upsample_mmx):
        push    ebp
        mov     eax,esp                         ; eax = original ebp
        sub     esp, byte 4
        and     esp, byte (-SIZEOF_MMWORD)      ; align to 64 bits
        mov     [esp],eax                       ; saved at [original_ebp]
        mov     ebp,esp                         ; ebp = aligned ebp
        lea     esp, [wk(0)]                    ; reserve wk[0..WK_NUM-1]
        pushpic eax                     ; make a room for GOT address
        push    ebx
;       push    ecx             ; need not be preserved
;       push    edx             ; need not be preserved
        push    esi
        push    edi

        get_GOT ebx                     ; get GOT address
        movpic  POINTER [gotptr], ebx   ; save GOT address

        ; NOTE(review): eax is still expected to hold the frame base here,
        ; i.e. this relies on get_GOT/movpic not modifying eax.
        mov     edx,eax                         ; edx = original ebp
        mov     eax, JDIMENSION [downsamp_width(edx)]   ; colctr
        test    eax,eax
        jz      near .return            ; no MMX register touched yet, so no emms needed

        mov     ecx, INT [max_v_samp(edx)]              ; rowctr
        test    ecx,ecx
        jz      near .return

        mov     esi, JSAMPARRAY [input_data(edx)]       ; input_data
        mov     edi, POINTER [output_data_ptr(edx)]
        mov     edi, JSAMPARRAY [edi]                   ; output_data
        alignx  16,7
.rowloop:
        push    eax                                     ; colctr
        push    ecx
        push    edi
        push    esi

        mov     ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW]   ; inptr1(above)
        mov     ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; inptr0
        mov     esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; inptr1(below)
        mov     edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]   ; outptr0
        mov     edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]   ; outptr1

        ; If the width is not a whole number of MMWORDs, replicate the last
        ; sample of each of the three input rows one position to the right.
        ; edx is saved because dl is used as the scratch byte.
        test    eax, SIZEOF_MMWORD-1
        jz      short .skip
        push    edx
        mov     dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]
        mov     JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl
        mov     dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]
        mov     JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl
        mov     dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
        mov     JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
        pop     edx
.skip:
        ; -- process the first column block

        movq    mm0, MMWORD [ebx+0*SIZEOF_MMWORD]       ; mm0=row[ 0][0]
        movq    mm1, MMWORD [ecx+0*SIZEOF_MMWORD]       ; mm1=row[-1][0]
        movq    mm2, MMWORD [esi+0*SIZEOF_MMWORD]       ; mm2=row[+1][0]

        pushpic ebx                     ; park inptr0 while ebx serves as GOT base
        movpic  ebx, POINTER [gotptr]   ; load GOT address

        pxor      mm3,mm3               ; mm3=(all 0's)
        movq      mm4,mm0
        punpcklbw mm0,mm3               ; mm0=row[ 0][0]( 0 1 2 3)
        punpckhbw mm4,mm3               ; mm4=row[ 0][0]( 4 5 6 7)
        movq      mm5,mm1
        punpcklbw mm1,mm3               ; mm1=row[-1][0]( 0 1 2 3)
        punpckhbw mm5,mm3               ; mm5=row[-1][0]( 4 5 6 7)
        movq      mm6,mm2
        punpcklbw mm2,mm3               ; mm2=row[+1][0]( 0 1 2 3)
        punpckhbw mm6,mm3               ; mm6=row[+1][0]( 4 5 6 7)

        pmullw  mm0,[GOTOFF(ebx,PW_THREE)]
        pmullw  mm4,[GOTOFF(ebx,PW_THREE)]

        ; mm7 = mask selecting only the lowest 16-bit word
        pcmpeqb mm7,mm7
        psrlq   mm7,(SIZEOF_MMWORD-2)*BYTE_BIT

        paddw   mm1,mm0                 ; mm1=Int0L=( 0 1 2 3)
        paddw   mm5,mm4                 ; mm5=Int0H=( 4 5 6 7)
        paddw   mm2,mm0                 ; mm2=Int1L=( 0 1 2 3)
        paddw   mm6,mm4                 ; mm6=Int1H=( 4 5 6 7)

        movq    MMWORD [edx+0*SIZEOF_MMWORD], mm1       ; temporarily save
        movq    MMWORD [edx+1*SIZEOF_MMWORD], mm5       ; the intermediate data
        movq    MMWORD [edi+0*SIZEOF_MMWORD], mm2
        movq    MMWORD [edi+1*SIZEOF_MMWORD], mm6

        ; Left edge: the "-1" neighbour of word 0 is word 0 itself.
        pand    mm1,mm7                 ; mm1=( 0 - - -)
        pand    mm2,mm7                 ; mm2=( 0 - - -)

        movq    MMWORD [wk(0)], mm1
        movq    MMWORD [wk(1)], mm2

        poppic  ebx                     ; ebx = inptr0 again

        ; Round colctr up to a whole number of MMWORDs.
        add     eax, byte SIZEOF_MMWORD-1
        and     eax, byte -SIZEOF_MMWORD
        cmp     eax, byte SIZEOF_MMWORD
        ja      short .columnloop
        alignx  16,7

.columnloop_last:
        ; -- process the last column block

        pushpic ebx
        movpic  ebx, POINTER [gotptr]   ; load GOT address

        ; Right edge: the "+8" neighbour is the block's own last
        ; intermediate word, kept in the highest word position.
        pcmpeqb mm1,mm1
        psllq   mm1,(SIZEOF_MMWORD-2)*BYTE_BIT
        movq    mm2,mm1

        pand    mm1, MMWORD [edx+1*SIZEOF_MMWORD]       ; mm1=( - - - 7)
        pand    mm2, MMWORD [edi+1*SIZEOF_MMWORD]       ; mm2=( - - - 7)

        movq    MMWORD [wk(2)], mm1
        movq    MMWORD [wk(3)], mm2

        jmp     short .upsample
        alignx  16,7

.columnloop:
        ; -- process the next column block
        ; The vertical pass for block n+1 is done before the horizontal pass
        ; of block n, because block n needs word 0 of block n+1 as its
        ; right-hand neighbour.

        movq    mm0, MMWORD [ebx+1*SIZEOF_MMWORD]       ; mm0=row[ 0][1]
        movq    mm1, MMWORD [ecx+1*SIZEOF_MMWORD]       ; mm1=row[-1][1]
        movq    mm2, MMWORD [esi+1*SIZEOF_MMWORD]       ; mm2=row[+1][1]

        pushpic ebx
        movpic  ebx, POINTER [gotptr]   ; load GOT address

        pxor      mm3,mm3               ; mm3=(all 0's)
        movq      mm4,mm0
        punpcklbw mm0,mm3               ; mm0=row[ 0][1]( 0 1 2 3)
        punpckhbw mm4,mm3               ; mm4=row[ 0][1]( 4 5 6 7)
        movq      mm5,mm1
        punpcklbw mm1,mm3               ; mm1=row[-1][1]( 0 1 2 3)
        punpckhbw mm5,mm3               ; mm5=row[-1][1]( 4 5 6 7)
        movq      mm6,mm2
        punpcklbw mm2,mm3               ; mm2=row[+1][1]( 0 1 2 3)
        punpckhbw mm6,mm3               ; mm6=row[+1][1]( 4 5 6 7)

        pmullw  mm0,[GOTOFF(ebx,PW_THREE)]
        pmullw  mm4,[GOTOFF(ebx,PW_THREE)]

        paddw   mm1,mm0                 ; mm1=Int0L=( 0 1 2 3)
        paddw   mm5,mm4                 ; mm5=Int0H=( 4 5 6 7)
        paddw   mm2,mm0                 ; mm2=Int1L=( 0 1 2 3)
        paddw   mm6,mm4                 ; mm6=Int1H=( 4 5 6 7)

        movq    MMWORD [edx+2*SIZEOF_MMWORD], mm1       ; temporarily save
        movq    MMWORD [edx+3*SIZEOF_MMWORD], mm5       ; the intermediate data
        movq    MMWORD [edi+2*SIZEOF_MMWORD], mm2
        movq    MMWORD [edi+3*SIZEOF_MMWORD], mm6

        ; Word 0 of the next block, moved to the highest word position, is
        ; the right-hand neighbour of the current block.
        psllq   mm1,(SIZEOF_MMWORD-2)*BYTE_BIT  ; mm1=( - - - 0)
        psllq   mm2,(SIZEOF_MMWORD-2)*BYTE_BIT  ; mm2=( - - - 0)

        movq    MMWORD [wk(2)], mm1
        movq    MMWORD [wk(3)], mm2

.upsample:
        ; -- process the upper row

        movq    mm7, MMWORD [edx+0*SIZEOF_MMWORD]       ; mm7=Int0L=( 0 1 2 3)
        movq    mm3, MMWORD [edx+1*SIZEOF_MMWORD]       ; mm3=Int0H=( 4 5 6 7)

        movq    mm0,mm7
        movq    mm4,mm3
        psrlq   mm0,2*BYTE_BIT                  ; mm0=( 1 2 3 -)
        psllq   mm4,(SIZEOF_MMWORD-2)*BYTE_BIT  ; mm4=( - - - 4)
        movq    mm5,mm7
        movq    mm6,mm3
        psrlq   mm5,(SIZEOF_MMWORD-2)*BYTE_BIT  ; mm5=( 3 - - -)
        psllq   mm6,2*BYTE_BIT                  ; mm6=( - 4 5 6)

        por     mm0,mm4                         ; mm0=( 1 2 3 4)
        por     mm5,mm6                         ; mm5=( 3 4 5 6)

        movq    mm1,mm7
        movq    mm2,mm3
        psllq   mm1,2*BYTE_BIT                  ; mm1=( - 0 1 2)
        psrlq   mm2,2*BYTE_BIT                  ; mm2=( 5 6 7 -)
        movq    mm4,mm3
        psrlq   mm4,(SIZEOF_MMWORD-2)*BYTE_BIT  ; mm4=( 7 - - -)

        por     mm1, MMWORD [wk(0)]             ; mm1=(-1 0 1 2)
        por     mm2, MMWORD [wk(2)]             ; mm2=( 5 6 7 8)

        movq    MMWORD [wk(0)], mm4             ; left neighbour for the next block

        pmullw  mm7,[GOTOFF(ebx,PW_THREE)]
        pmullw  mm3,[GOTOFF(ebx,PW_THREE)]
        paddw   mm1,[GOTOFF(ebx,PW_EIGHT)]
        paddw   mm5,[GOTOFF(ebx,PW_EIGHT)]
        paddw   mm0,[GOTOFF(ebx,PW_SEVEN)]
        paddw   mm2,[GOTOFF(ebx,PW_SEVEN)]

        paddw   mm1,mm7
        paddw   mm5,mm3
        psrlw   mm1,4                   ; mm1=Out0LE=( 0  2  4  6)
        psrlw   mm5,4                   ; mm5=Out0HE=( 8 10 12 14)
        paddw   mm0,mm7
        paddw   mm2,mm3
        psrlw   mm0,4                   ; mm0=Out0LO=( 1  3  5  7)
        psrlw   mm2,4                   ; mm2=Out0HO=( 9 11 13 15)

        psllw   mm0,BYTE_BIT
        psllw   mm2,BYTE_BIT
        por     mm1,mm0                 ; mm1=Out0L=( 0  1  2  3  4  5  6  7)
        por     mm5,mm2                 ; mm5=Out0H=( 8  9 10 11 12 13 14 15)

        ; Final bytes overwrite the intermediate words parked in outptr0.
        movq    MMWORD [edx+0*SIZEOF_MMWORD], mm1
        movq    MMWORD [edx+1*SIZEOF_MMWORD], mm5

        ; -- process the lower row

        movq    mm6, MMWORD [edi+0*SIZEOF_MMWORD]       ; mm6=Int1L=( 0 1 2 3)
        movq    mm4, MMWORD [edi+1*SIZEOF_MMWORD]       ; mm4=Int1H=( 4 5 6 7)

        movq    mm7,mm6
        movq    mm3,mm4
        psrlq   mm7,2*BYTE_BIT                  ; mm7=( 1 2 3 -)
        psllq   mm3,(SIZEOF_MMWORD-2)*BYTE_BIT  ; mm3=( - - - 4)
        movq    mm0,mm6
        movq    mm2,mm4
        psrlq   mm0,(SIZEOF_MMWORD-2)*BYTE_BIT  ; mm0=( 3 - - -)
        psllq   mm2,2*BYTE_BIT                  ; mm2=( - 4 5 6)

        por     mm7,mm3                         ; mm7=( 1 2 3 4)
        por     mm0,mm2                         ; mm0=( 3 4 5 6)

        movq    mm1,mm6
        movq    mm5,mm4
        psllq   mm1,2*BYTE_BIT                  ; mm1=( - 0 1 2)
        psrlq   mm5,2*BYTE_BIT                  ; mm5=( 5 6 7 -)
        movq    mm3,mm4
        psrlq   mm3,(SIZEOF_MMWORD-2)*BYTE_BIT  ; mm3=( 7 - - -)

        por     mm1, MMWORD [wk(1)]             ; mm1=(-1 0 1 2)
        por     mm5, MMWORD [wk(3)]             ; mm5=( 5 6 7 8)

        movq    MMWORD [wk(1)], mm3             ; left neighbour for the next block

        pmullw  mm6,[GOTOFF(ebx,PW_THREE)]
        pmullw  mm4,[GOTOFF(ebx,PW_THREE)]
        paddw   mm1,[GOTOFF(ebx,PW_EIGHT)]
        paddw   mm0,[GOTOFF(ebx,PW_EIGHT)]
        paddw   mm7,[GOTOFF(ebx,PW_SEVEN)]
        paddw   mm5,[GOTOFF(ebx,PW_SEVEN)]

        paddw   mm1,mm6
        paddw   mm0,mm4
        psrlw   mm1,4                   ; mm1=Out1LE=( 0  2  4  6)
        psrlw   mm0,4                   ; mm0=Out1HE=( 8 10 12 14)
        paddw   mm7,mm6
        paddw   mm5,mm4
        psrlw   mm7,4                   ; mm7=Out1LO=( 1  3  5  7)
        psrlw   mm5,4                   ; mm5=Out1HO=( 9 11 13 15)

        psllw   mm7,BYTE_BIT
        psllw   mm5,BYTE_BIT
        por     mm1,mm7                 ; mm1=Out1L=( 0  1  2  3  4  5  6  7)
        por     mm0,mm5                 ; mm0=Out1H=( 8  9 10 11 12 13 14 15)

        ; Final bytes overwrite the intermediate words parked in outptr1.
        movq    MMWORD [edi+0*SIZEOF_MMWORD], mm1
        movq    MMWORD [edi+1*SIZEOF_MMWORD], mm0

        poppic  ebx                     ; ebx = inptr0 again

        sub     eax, byte SIZEOF_MMWORD
        add     ecx, byte 1*SIZEOF_MMWORD       ; inptr1(above)
        add     ebx, byte 1*SIZEOF_MMWORD       ; inptr0
        add     esi, byte 1*SIZEOF_MMWORD       ; inptr1(below)
        add     edx, byte 2*SIZEOF_MMWORD       ; outptr0
        add     edi, byte 2*SIZEOF_MMWORD       ; outptr1
        cmp     eax, byte SIZEOF_MMWORD
        ja      near .columnloop                ; more than one block left
        test    eax,eax
        jnz     near .columnloop_last           ; exactly one block left

        pop     esi
        pop     edi
        pop     ecx
        pop     eax

        add     esi, byte 1*SIZEOF_JSAMPROW     ; input_data
        add     edi, byte 2*SIZEOF_JSAMPROW     ; output_data
        sub     ecx, byte 2                     ; rowctr
        jg      near .rowloop

        emms                            ; empty MMX state

.return:
        pop     edi
        pop     esi
;       pop     edx             ; need not be preserved
;       pop     ecx             ; need not be preserved
        pop     ebx
        mov     esp,ebp         ; esp <- aligned ebp
        pop     esp             ; esp <- original ebp
        pop     ebp
        ret

; --------------------------------------------------------------------------
;
; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
; It's still a box filter.
;
; GLOBAL(void)
; jsimd_h2v1_upsample_mmx (int max_v_samp_factor,
;                          JDIMENSION output_width,
;                          JSAMPARRAY input_data,
;                          JSAMPARRAY * output_data_ptr);
;
; Every input sample is written twice in a row to the output
; (punpcklbw/punpckhbw of a register with itself).
;
; output_width is rounded up to a multiple of 2*SIZEOF_MMWORD, so each row
; is read and written in whole units of one input MMWORD / two output
; MMWORDs; accesses therefore extend past output_width when it is not such
; a multiple.  The function returns immediately if the rounded width or
; max_v_samp_factor is zero.
;
; Register usage:
;       edx = rounded output width (loop invariant)
;       eax = colctr (output samples left in the current row)
;       ecx = rowctr
;       esi = input_data, then inptr;  edi = output_data, then outptr
; eax, ecx, edx and mm0-mm3 are clobbered; esi, edi and ebp are restored.
;

%define max_v_samp(b)           (b)+8           ; int max_v_samp_factor
%define output_width(b)         (b)+12          ; JDIMENSION output_width
%define input_data(b)           (b)+16          ; JSAMPARRAY input_data
%define output_data_ptr(b)      (b)+20          ; JSAMPARRAY * output_data_ptr

        align   16
        global  EXTN(jsimd_h2v1_upsample_mmx)

EXTN(jsimd_h2v1_upsample_mmx):
        push    ebp
        mov     ebp,esp
;       push    ebx             ; unused
;       push    ecx             ; need not be preserved
;       push    edx             ; need not be preserved
        push    esi
        push    edi

        mov     edx, JDIMENSION [output_width(ebp)]
        add     edx, byte (2*SIZEOF_MMWORD)-1
        and     edx, byte -(2*SIZEOF_MMWORD)    ; round up; ZF set if result is 0
        jz      short .return

        mov     ecx, INT [max_v_samp(ebp)]      ; rowctr
        test    ecx,ecx
        jz      short .return

        mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
        mov     edi, POINTER [output_data_ptr(ebp)]
        mov     edi, JSAMPARRAY [edi]                   ; output_data
        alignx  16,7
.rowloop:
        push    edi
        push    esi

        mov     esi, JSAMPROW [esi]             ; inptr
        mov     edi, JSAMPROW [edi]             ; outptr
        mov     eax,edx                         ; colctr
        alignx  16,7
.columnloop:
        ; The loop body is unrolled twice; colctr is tested after each half
        ; so a row may end after either one.

        movq    mm0, MMWORD [esi+0*SIZEOF_MMWORD]

        movq      mm1,mm0
        punpcklbw mm0,mm0               ; duplicate samples 0..3
        punpckhbw mm1,mm1               ; duplicate samples 4..7

        movq    MMWORD [edi+0*SIZEOF_MMWORD], mm0
        movq    MMWORD [edi+1*SIZEOF_MMWORD], mm1

        sub     eax, byte 2*SIZEOF_MMWORD
        jz      short .nextrow

        movq    mm2, MMWORD [esi+1*SIZEOF_MMWORD]

        movq      mm3,mm2
        punpcklbw mm2,mm2
        punpckhbw mm3,mm3

        movq    MMWORD [edi+2*SIZEOF_MMWORD], mm2
        movq    MMWORD [edi+3*SIZEOF_MMWORD], mm3

        sub     eax, byte 2*SIZEOF_MMWORD
        jz      short .nextrow

        add     esi, byte 2*SIZEOF_MMWORD       ; inptr
        add     edi, byte 4*SIZEOF_MMWORD       ; outptr
        jmp     short .columnloop
        alignx  16,7

.nextrow:
        pop     esi
        pop     edi

        add     esi, byte SIZEOF_JSAMPROW       ; input_data
        add     edi, byte SIZEOF_JSAMPROW       ; output_data
        dec     ecx                             ; rowctr
        jg      short .rowloop

        emms                            ; empty MMX state

.return:
        pop     edi
        pop     esi
;       pop     edx             ; need not be preserved
;       pop     ecx             ; need not be preserved
;       pop     ebx             ; unused
        pop     ebp
        ret

; --------------------------------------------------------------------------
;
; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
; It's still a box filter.
;
; GLOBAL(void)
; jsimd_h2v2_upsample_mmx (int max_v_samp_factor,
;                          JDIMENSION output_width,
;                          JSAMPARRAY input_data,
;                          JSAMPARRAY * output_data_ptr);
;
; Every input sample is written twice in a row, and the resulting row is
; stored to two consecutive output rows.  Each pass of .rowloop therefore
; consumes one input row pointer and two output row pointers, and rowctr
; goes down by 2.
;
; output_width is rounded up to a multiple of 2*SIZEOF_MMWORD, so rows are
; read and written in whole units of one input MMWORD / two output MMWORDs;
; accesses therefore extend past output_width when it is not such a
; multiple.  The function returns immediately if the rounded width or
; max_v_samp_factor is zero.
;
; Register usage:
;       edx = rounded output width (loop invariant)
;       eax = colctr (output samples left in the current row)
;       ecx = rowctr
;       esi = input_data, then inptr
;       edi = output_data, then outptr1;  ebx = outptr0
; eax, ecx, edx and mm0-mm3 are clobbered; ebx, esi, edi, ebp are restored.
;

%define max_v_samp(b)           (b)+8           ; int max_v_samp_factor
%define output_width(b)         (b)+12          ; JDIMENSION output_width
%define input_data(b)           (b)+16          ; JSAMPARRAY input_data
%define output_data_ptr(b)      (b)+20          ; JSAMPARRAY * output_data_ptr

        align   16
        global  EXTN(jsimd_h2v2_upsample_mmx)

EXTN(jsimd_h2v2_upsample_mmx):
        push    ebp
        mov     ebp,esp
        push    ebx
;       push    ecx             ; need not be preserved
;       push    edx             ; need not be preserved
        push    esi
        push    edi

        mov     edx, JDIMENSION [output_width(ebp)]
        add     edx, byte (2*SIZEOF_MMWORD)-1
        and     edx, byte -(2*SIZEOF_MMWORD)    ; round up; ZF set if result is 0
        jz      near .return

        mov     ecx, INT [max_v_samp(ebp)]      ; rowctr
        test    ecx,ecx
        jz      short .return

        mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
        mov     edi, POINTER [output_data_ptr(ebp)]
        mov     edi, JSAMPARRAY [edi]                   ; output_data
        alignx  16,7
.rowloop:
        push    edi
        push    esi

        mov     esi, JSAMPROW [esi]                     ; inptr
        mov     ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]   ; outptr0
        mov     edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]   ; outptr1
        mov     eax,edx                                 ; colctr
        alignx  16,7
.columnloop:
        ; The loop body is unrolled twice; colctr is tested after each half
        ; so a row may end after either one.

        movq    mm0, MMWORD [esi+0*SIZEOF_MMWORD]

        movq      mm1,mm0
        punpcklbw mm0,mm0               ; duplicate samples 0..3
        punpckhbw mm1,mm1               ; duplicate samples 4..7

        movq    MMWORD [ebx+0*SIZEOF_MMWORD], mm0
        movq    MMWORD [ebx+1*SIZEOF_MMWORD], mm1
        movq    MMWORD [edi+0*SIZEOF_MMWORD], mm0       ; same data to the second row
        movq    MMWORD [edi+1*SIZEOF_MMWORD], mm1

        sub     eax, byte 2*SIZEOF_MMWORD
        jz      short .nextrow

        movq    mm2, MMWORD [esi+1*SIZEOF_MMWORD]

        movq      mm3,mm2
        punpcklbw mm2,mm2
        punpckhbw mm3,mm3

        movq    MMWORD [ebx+2*SIZEOF_MMWORD], mm2
        movq    MMWORD [ebx+3*SIZEOF_MMWORD], mm3
        movq    MMWORD [edi+2*SIZEOF_MMWORD], mm2
        movq    MMWORD [edi+3*SIZEOF_MMWORD], mm3

        sub     eax, byte 2*SIZEOF_MMWORD
        jz      short .nextrow

        add     esi, byte 2*SIZEOF_MMWORD       ; inptr
        add     ebx, byte 4*SIZEOF_MMWORD       ; outptr0
        add     edi, byte 4*SIZEOF_MMWORD       ; outptr1
        jmp     short .columnloop
        alignx  16,7

.nextrow:
        pop     esi
        pop     edi

        add     esi, byte 1*SIZEOF_JSAMPROW     ; input_data
        add     edi, byte 2*SIZEOF_JSAMPROW     ; output_data
        sub     ecx, byte 2                     ; rowctr
        jg      short .rowloop

        emms                            ; empty MMX state

.return:
        pop     edi
        pop     esi
;       pop     edx             ; need not be preserved
;       pop     ecx             ; need not be preserved
        pop     ebx
        pop     ebp
        ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
        align   16