/*
 * Copyright 2012 The Android Open Source Project
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#include "SkBlitRow.h"
#include "SkColorPriv.h"
#include "SkDither.h"
#include "SkMathPriv.h"
#include "SkUtils.h"
#include "SkUtilsArm.h"

#include "SkCachePreload_arm.h"

// Define USE_NEON_CODE to indicate that we need to build NEON routines
#define USE_NEON_CODE (!SK_ARM_NEON_IS_NONE)

// Define USE_ARM_CODE to indicate that we need to build ARM routines
#define USE_ARM_CODE (!SK_ARM_NEON_IS_ALWAYS)

#if USE_NEON_CODE
#include "SkBlitRow_opts_arm_neon.h"
#endif

#if USE_ARM_CODE

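/*
 * S32A_D565_Opaque: blit a row of premultiplied 32-bit ARGB (SkPMColor)
 * source pixels over an RGB565 destination. Per pixel, the assembly below
 * takes one of three paths: a fully opaque source is packed straight to 565,
 * a fully transparent source is skipped, and anything in between is blended
 * into the destination using the source alpha.
 */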
static void S32A_D565_Opaque(uint16_t* SK_RESTRICT dst,
                             const SkPMColor* SK_RESTRICT src, int count,
                             U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 == alpha);

    asm volatile (
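        /* Main loop: load one source pixel. A fully opaque pixel (alpha ==
           0xff) falls through to the direct 8888 -> 565 pack below; anything
           else branches to the blend/skip paths at 2f. */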
        "1:                                \n\t"
        "ldr r3, [%[src]], #4              \n\t"
        "cmp r3, #0xff000000               \n\t"
        "blo 2f                            \n\t"
        "and r4, r3, #0x0000f8             \n\t"
        "and r5, r3, #0x00fc00             \n\t"
        "and r6, r3, #0xf80000             \n\t"
#ifdef SK_ARM_HAS_EDSP
        "pld [r1, #32]                     \n\t"
#endif
        "lsl r3, r4, #8                    \n\t"
        "orr r3, r3, r5, lsr #5            \n\t"
        "orr r3, r3, r6, lsr #19           \n\t"
        "subs %[count], %[count], #1       \n\t"
        "strh r3, [%[dst]], #2             \n\t"
        "bne 1b                            \n\t"
        "b 4f                              \n\t"
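        /* Blend path: the source pixel is partially transparent. Scale the
           destination's 565 channels by (255 - srcAlpha), then add the
           corresponding source channels. */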
        "2:                                \n\t"
        "lsrs r7, r3, #24                  \n\t"
        "beq 3f                            \n\t"
        "ldrh r4, [%[dst]]                 \n\t"
        "rsb r7, r7, #255                  \n\t"
        "and r6, r4, #0x001f               \n\t"
#if SK_ARM_ARCH <= 6
        "lsl r5, r4, #21                   \n\t"
        "lsr r5, r5, #26                   \n\t"
#else
        "ubfx r5, r4, #5, #6               \n\t"
#endif
#ifdef SK_ARM_HAS_EDSP
        "pld [r0, #16]                     \n\t"
#endif
        "lsr r4, r4, #11                   \n\t"
#ifdef SK_ARM_HAS_EDSP
        "smulbb r6, r6, r7                 \n\t"
        "smulbb r5, r5, r7                 \n\t"
        "smulbb r4, r4, r7                 \n\t"
#else
        "mul r6, r6, r7                    \n\t"
        "mul r5, r5, r7                    \n\t"
        "mul r4, r4, r7                    \n\t"
#endif
#if SK_ARM_ARCH >= 6
        "uxtb r7, r3, ROR #16              \n\t"
        "uxtb ip, r3, ROR #8               \n\t"
#else
        "mov ip, #0xff                     \n\t"
        "and r7, ip, r3, ROR #16           \n\t"
        "and ip, ip, r3, ROR #8            \n\t"
#endif
        "and r3, r3, #0xff                 \n\t"
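        /* Round the scaled destination channels, add the source channels,
           and repack the result into a 565 pixel. */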
        "add r6, r6, #16                   \n\t"
        "add r5, r5, #32                   \n\t"
        "add r4, r4, #16                   \n\t"
        "add r6, r6, r6, lsr #5            \n\t"
        "add r5, r5, r5, lsr #6            \n\t"
        "add r4, r4, r4, lsr #5            \n\t"
        "add r6, r7, r6, lsr #5            \n\t"
        "add r5, ip, r5, lsr #6            \n\t"
        "add r4, r3, r4, lsr #5            \n\t"
        "lsr r6, r6, #3                    \n\t"
        "and r5, r5, #0xfc                 \n\t"
        "and r4, r4, #0xf8                 \n\t"
        "orr r6, r6, r5, lsl #3            \n\t"
        "orr r4, r6, r4, lsl #8            \n\t"
        "strh r4, [%[dst]], #2             \n\t"
#ifdef SK_ARM_HAS_EDSP
        "pld [r1, #32]                     \n\t"
#endif
        "subs %[count], %[count], #1       \n\t"
        "bne 1b                            \n\t"
        "b 4f                              \n\t"
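        /* Transparent path: source alpha is zero, so the destination pixel
           is left untouched and dst just advances. */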
        "3:                                \n\t"
        "subs %[count], %[count], #1       \n\t"
        "add %[dst], %[dst], #2            \n\t"
        "bne 1b                            \n\t"
        "4:                                \n\t"
        : [dst] "+r" (dst), [src] "+r" (src), [count] "+r" (count)
        :
        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "ip"
    );
}

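/*
 * S32A_Opaque_BlitRow32_arm: standard premultiplied src-over, i.e. per pixel
 * *dst = src + SkAlphaMulQ(*dst, 256 - srcAlpha). The main loop below handles
 * two pixels per iteration, with a single-pixel tail loop for odd counts.
 */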
static void S32A_Opaque_BlitRow32_arm(SkPMColor* SK_RESTRICT dst,
                                      const SkPMColor* SK_RESTRICT src,
                                      int count, U8CPU alpha) {

    SkASSERT(255 == alpha);

    asm volatile (
        "cmp %[count], #0                  \n\t" /* comparing count with 0 */
        "beq 3f                            \n\t" /* if zero exit */

        "mov ip, #0xff                     \n\t" /* load the 0xff mask in ip */
        "orr ip, ip, ip, lsl #16           \n\t" /* convert it to 0xff00ff in ip */

        "cmp %[count], #2                  \n\t" /* compare count with 2 */
        "blt 2f                            \n\t" /* if less than 2 -> single loop */

        /* Double Loop */
        "1:                                \n\t" /* <double loop> */
        "ldm %[src]!, {r5,r6}              \n\t" /* load two source pixels into r5-r6 */
        "ldm %[dst], {r7,r8}               \n\t" /* load two destination pixels into r7-r8 */
        "lsr r4, r5, #24                   \n\t" /* extracting the alpha from source and storing it to r4 */

        /* ----------- */
        "and r9, ip, r7                    \n\t" /* r9 = br masked by ip */
        "rsb r4, r4, #256                  \n\t" /* subtracting the alpha from 256 -> r4=scale */
        "and r10, ip, r7, lsr #8           \n\t" /* r10 = ag masked by ip */

        "mul r9, r9, r4                    \n\t" /* br = br * scale */
        "mul r10, r10, r4                  \n\t" /* ag = ag * scale */
        "and r9, ip, r9, lsr #8            \n\t" /* lsr br by 8 and mask it */

        "and r10, r10, ip, lsl #8          \n\t" /* mask ag with reverse mask */
        "lsr r4, r6, #24                   \n\t" /* extracting the alpha from source and storing it to r4 */
        "orr r7, r9, r10                   \n\t" /* br | ag */

        "add r7, r5, r7                    \n\t" /* dst = src + calc dest(r7) */
        "rsb r4, r4, #256                  \n\t" /* subtracting the alpha from 256 -> r4=scale */

        /* ----------- */
        "and r9, ip, r8                    \n\t" /* r9 = br masked by ip */

        "and r10, ip, r8, lsr #8           \n\t" /* r10 = ag masked by ip */
        "mul r9, r9, r4                    \n\t" /* br = br * scale */
        "sub %[count], %[count], #2        \n\t"
        "mul r10, r10, r4                  \n\t" /* ag = ag * scale */

        "and r9, ip, r9, lsr #8            \n\t" /* lsr br by 8 and mask it */
        "and r10, r10, ip, lsl #8          \n\t" /* mask ag with reverse mask */
        "cmp %[count], #1                  \n\t" /* comparing count with 1 */
        "orr r8, r9, r10                   \n\t" /* br | ag */

        "add r8, r6, r8                    \n\t" /* dst = src + calc dest(r8) */

        /* ----------------- */
        "stm %[dst]!, {r7,r8}              \n\t" /* store r7 and r8 to dst[0..1], post-increment dst by 8 bytes */
        /* ----------------- */

        "bgt 1b                            \n\t" /* if greater than 1 -> reloop */
        "blt 3f                            \n\t" /* if less than 1 -> exit */

        /* Single Loop */
        "2:                                \n\t" /* <single loop> */
        "ldr r5, [%[src]], #4              \n\t" /* load the source pixel into r5 */
        "ldr r7, [%[dst]]                  \n\t" /* loading dst into r7 */
        "lsr r4, r5, #24                   \n\t" /* extracting the alpha from source and storing it to r4 */

        /* ----------- */
        "and r9, ip, r7                    \n\t" /* r9 = br masked by ip */
        "rsb r4, r4, #256                  \n\t" /* subtracting the alpha from 256 -> r4=scale */

        "and r10, ip, r7, lsr #8           \n\t" /* r10 = ag masked by ip */
        "mul r9, r9, r4                    \n\t" /* br = br * scale */
        "mul r10, r10, r4                  \n\t" /* ag = ag * scale */
        "and r9, ip, r9, lsr #8            \n\t" /* lsr br by 8 and mask it */

        "and r10, r10, ip, lsl #8          \n\t" /* mask ag */
        "orr r7, r9, r10                   \n\t" /* br | ag */

        "add r7, r5, r7                    \n\t" /* *dst = src + calc dest(r7) */

        /* ----------------- */
        "str r7, [%[dst]], #4              \n\t" /* *dst = r7, increment dst by one (times 4) */
        /* ----------------- */

        "3:                                \n\t" /* <exit> */
        : [dst] "+r" (dst), [src] "+r" (src), [count] "+r" (count)
        :
        : "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "ip", "memory"
    );
}

/*
 * ARM asm version of S32A_Blend_BlitRow32
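 *
 * Per pixel: src_scale = alpha + 1, dst_scale = 256 - ((srcAlpha * src_scale) >> 8),
 * and *dst = SkAlphaMulQ(src, src_scale) + SkAlphaMulQ(*dst, dst_scale), using the
 * usual 0xff00ff mask trick to scale both halves of a pixel at once.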
 */
void S32A_Blend_BlitRow32_arm(SkPMColor* SK_RESTRICT dst,
                              const SkPMColor* SK_RESTRICT src,
                              int count, U8CPU alpha) {
    asm volatile (
        "cmp %[count], #0                  \n\t" /* comparing count with 0 */
        "beq 3f                            \n\t" /* if zero exit */

        "mov r12, #0xff                    \n\t" /* load the 0xff mask in r12 */
        "orr r12, r12, r12, lsl #16        \n\t" /* convert it to 0xff00ff in r12 */

        /* src1,2_scale */
        "add %[alpha], %[alpha], #1        \n\t" /* loading %[alpha]=src_scale=alpha+1 */

        "cmp %[count], #2                  \n\t" /* comparing count with 2 */
        "blt 2f                            \n\t" /* if less than 2 -> single loop */

        /* Double Loop */
        "1:                                \n\t" /* <double loop> */
        "ldm %[src]!, {r5, r6}             \n\t" /* load two source pixels into r5 and r6 */
        "ldm %[dst], {r7, r8}              \n\t" /* load two destination pixels into r7 and r8 */

        /* dst1_scale and dst2_scale */
        "lsr r9, r5, #24                   \n\t" /* src >> 24 */
        "lsr r10, r6, #24                  \n\t" /* src >> 24 */
#ifdef SK_ARM_HAS_EDSP
        "smulbb r9, r9, %[alpha]           \n\t" /* r9 = SkMulS16 r9 with src_scale */
        "smulbb r10, r10, %[alpha]         \n\t" /* r10 = SkMulS16 r10 with src_scale */
#else
        "mul r9, r9, %[alpha]              \n\t" /* r9 = SkMulS16 r9 with src_scale */
        "mul r10, r10, %[alpha]            \n\t" /* r10 = SkMulS16 r10 with src_scale */
#endif
        "lsr r9, r9, #8                    \n\t" /* r9 >> 8 */
        "lsr r10, r10, #8                  \n\t" /* r10 >> 8 */
        "rsb r9, r9, #256                  \n\t" /* dst1_scale = r9 = 255 - r9 + 1 */
        "rsb r10, r10, #256                \n\t" /* dst2_scale = r10 = 255 - r10 + 1 */

        /* ---------------------- */

        /* src1, src1_scale */
        "and r11, r12, r5, lsr #8          \n\t" /* ag = r11 = r5 masked by r12 lsr by #8 */
        "and r4, r12, r5                   \n\t" /* rb = r4 = r5 masked by r12 */
        "mul r11, r11, %[alpha]            \n\t" /* ag = r11 times src_scale */
        "mul r4, r4, %[alpha]              \n\t" /* rb = r4 times src_scale */
        "and r11, r11, r12, lsl #8         \n\t" /* ag masked by reverse mask (r12) */
        "and r4, r12, r4, lsr #8           \n\t" /* rb masked by mask (r12) */
        "orr r5, r11, r4                   \n\t" /* r5 = (src1, src_scale) */

        /* dst1, dst1_scale */
        "and r11, r12, r7, lsr #8          \n\t" /* ag = r11 = r7 masked by r12 lsr by #8 */
        "and r4, r12, r7                   \n\t" /* rb = r4 = r7 masked by r12 */
        "mul r11, r11, r9                  \n\t" /* ag = r11 times dst_scale (r9) */
        "mul r4, r4, r9                    \n\t" /* rb = r4 times dst_scale (r9) */
        "and r11, r11, r12, lsl #8         \n\t" /* ag masked by reverse mask (r12) */
        "and r4, r12, r4, lsr #8           \n\t" /* rb masked by mask (r12) */
        "orr r9, r11, r4                   \n\t" /* r9 = (dst1, dst_scale) */

        /* ---------------------- */
        "add r9, r5, r9                    \n\t" /* *dst = src plus dst both scaled */
        /* ---------------------- */

        /* ====================== */

        /* src2, src2_scale */
        "and r11, r12, r6, lsr #8          \n\t" /* ag = r11 = r6 masked by r12 lsr by #8 */
        "and r4, r12, r6                   \n\t" /* rb = r4 = r6 masked by r12 */
        "mul r11, r11, %[alpha]            \n\t" /* ag = r11 times src_scale */
        "mul r4, r4, %[alpha]              \n\t" /* rb = r4 times src_scale */
        "and r11, r11, r12, lsl #8         \n\t" /* ag masked by reverse mask (r12) */
        "and r4, r12, r4, lsr #8           \n\t" /* rb masked by mask (r12) */
        "orr r6, r11, r4                   \n\t" /* r6 = (src2, src_scale) */

        /* dst2, dst2_scale */
        "and r11, r12, r8, lsr #8          \n\t" /* ag = r11 = r8 masked by r12 lsr by #8 */
        "and r4, r12, r8                   \n\t" /* rb = r4 = r8 masked by r12 */
        "mul r11, r11, r10                 \n\t" /* ag = r11 times dst_scale (r10) */
        "mul r4, r4, r10                   \n\t" /* rb = r4 times dst_scale (r10) */
        "and r11, r11, r12, lsl #8         \n\t" /* ag masked by reverse mask (r12) */
        "and r4, r12, r4, lsr #8           \n\t" /* rb masked by mask (r12) */
        "orr r10, r11, r4                  \n\t" /* r10 = (dst2, dst_scale) */

        "sub %[count], %[count], #2        \n\t" /* decrease count by 2 */
        /* ---------------------- */
        "add r10, r6, r10                  \n\t" /* *dst = src plus dst both scaled */
        /* ---------------------- */
        "cmp %[count], #1                  \n\t" /* compare count with 1 */
        /* ----------------- */
        "stm %[dst]!, {r9, r10}            \n\t" /* store r9 and r10 to dst[0..1], post-increment dst by 8 bytes */
        /* ----------------- */

        "bgt 1b                            \n\t" /* if %[count] greater than 1 reloop */
        "blt 3f                            \n\t" /* if %[count] less than 1 exit */
        /* else fall through into the single loop */
        /* Single Loop */
        "2:                                \n\t" /* <single loop> */
        "ldr r5, [%[src]], #4              \n\t" /* load the source pixel into r5 */
        "ldr r7, [%[dst]]                  \n\t" /* load the destination pixel into r7 */

        "lsr r6, r5, #24                   \n\t" /* src >> 24 */
        "and r8, r12, r5, lsr #8           \n\t" /* ag = r8 = r5 masked by r12 lsr by #8 */
#ifdef SK_ARM_HAS_EDSP
        "smulbb r6, r6, %[alpha]           \n\t" /* r6 = SkMulS16 with src_scale */
#else
        "mul r6, r6, %[alpha]              \n\t" /* r6 = SkMulS16 with src_scale */
#endif
        "and r9, r12, r5                   \n\t" /* rb = r9 = r5 masked by r12 */
        "lsr r6, r6, #8                    \n\t" /* r6 >> 8 */
        "mul r8, r8, %[alpha]              \n\t" /* ag = r8 times scale */
        "rsb r6, r6, #256                  \n\t" /* r6 = 255 - r6 + 1 */

        /* src, src_scale */
        "mul r9, r9, %[alpha]              \n\t" /* rb = r9 times scale */
        "and r8, r8, r12, lsl #8           \n\t" /* ag masked by reverse mask (r12) */
        "and r9, r12, r9, lsr #8           \n\t" /* rb masked by mask (r12) */
        "orr r10, r8, r9                   \n\t" /* r10 = (src, src_scale) */

        /* dst, dst_scale */
        "and r8, r12, r7, lsr #8           \n\t" /* ag = r8 = r7 masked by r12 lsr by #8 */
        "and r9, r12, r7                   \n\t" /* rb = r9 = r7 masked by r12 */
        "mul r8, r8, r6                    \n\t" /* ag = r8 times scale (r6) */
        "mul r9, r9, r6                    \n\t" /* rb = r9 times scale (r6) */
        "and r8, r8, r12, lsl #8           \n\t" /* ag masked by reverse mask (r12) */
        "and r9, r12, r9, lsr #8           \n\t" /* rb masked by mask (r12) */
        "orr r7, r8, r9                    \n\t" /* r7 = (dst, dst_scale) */

        "add r10, r7, r10                  \n\t" /* *dst = src plus dst both scaled */

        /* ----------------- */
        "str r10, [%[dst]], #4             \n\t" /* *dst = r10, postincrement dst by one (times 4) */
        /* ----------------- */

        "3:                                \n\t" /* <exit> */
        : [dst] "+r" (dst), [src] "+r" (src), [count] "+r" (count), [alpha] "+r" (alpha)
        :
        : "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "memory"
    );

}

///////////////////////////////////////////////////////////////////////////////

static const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm[] = {
    // no dither
    // NOTE: For the functions below, we don't have a special version
    //       that assumes that each source pixel is opaque. But our S32A is
    //       still faster than the default, so use it.
    S32A_D565_Opaque,   // S32_D565_Opaque
    NULL,               // S32_D565_Blend
    S32A_D565_Opaque,   // S32A_D565_Opaque
    NULL,               // S32A_D565_Blend

    // dither
    NULL,   // S32_D565_Opaque_Dither
    NULL,   // S32_D565_Blend_Dither
    NULL,   // S32A_D565_Opaque_Dither
    NULL,   // S32A_D565_Blend_Dither
};

static const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm[] = {
    NULL,                       // S32_Opaque,
    NULL,                       // S32_Blend,
    S32A_Opaque_BlitRow32_arm,  // S32A_Opaque,
    S32A_Blend_BlitRow32_arm    // S32A_Blend
};

#endif // USE_ARM_CODE

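// SK_ARM_NEON_WRAP(x) resolves to x when NEON is disabled, to x##_neon when
// NEON is unconditionally enabled, and to a runtime CPU check that picks
// between the two when NEON support is detected dynamically (see SkUtilsArm.h).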
SkBlitRow::Proc SkBlitRow::PlatformProcs565(unsigned flags) {
    return SK_ARM_NEON_WRAP(sk_blitrow_platform_565_procs_arm)[flags];
}

SkBlitRow::Proc32 SkBlitRow::PlatformProcs32(unsigned flags) {
    return SK_ARM_NEON_WRAP(sk_blitrow_platform_32_procs_arm)[flags];
}

///////////////////////////////////////////////////////////////////////////////
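// No plain-ARM Color32 routine is provided (Color32_arm is NULL); the wrap
// below can still pick up a NEON implementation when NEON is available, and
// otherwise returns NULL so the caller falls back to the portable default.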
#define Color32_arm NULL
SkBlitRow::ColorProc SkBlitRow::PlatformColorProc() {
    return SK_ARM_NEON_WRAP(Color32_arm);
}

SkBlitRow::ColorRectProc PlatformColorRectProcFactory() {
    return NULL;
}