/*
 * Layout constants for pixels that pack four 8-bit channels into one
 * 32-bit word: alpha in bits 24-31, red in bits 16-23, green in bits 8-15
 * and blue in bits 0-7.
 */
#define COMPONENT_SIZE 8	/* bits per channel */
#define MASK 0xff		/* mask of one channel in the low byte */
#define ONE_HALF 0x80		/* rounding bias: half of one channel step */

/*
 * Shift amounts are parenthesized so they stay a single operand wherever
 * they are expanded (e.g. "n / A_SHIFT" or "n % R_SHIFT").
 */
#define A_SHIFT (8 * 3)
#define R_SHIFT (8 * 2)
#define G_SHIFT 8
#define A_MASK 0xff000000
#define R_MASK 0xff0000
#define G_MASK 0xff00

/*
 * Two-lane constants used by the macros that process red+blue (or,
 * after a shift by G_SHIFT, alpha+green) in a single 32-bit operation.
 */
#define RB_MASK 0xff00ff
#define AG_MASK 0xff00ff00
#define RB_ONE_HALF 0x800080
/*
 * One past the top of each lane (bit 8 and bit 24).  Only the bits inside
 * RB_MASK of "RB_MASK_PLUS_ONE - overflow_bits" are ever kept by
 * UN8_rb_ADD_UN8_rb, so the bit-24 value gives the same masked result
 * as any other constant whose low 24 bits are 0x000100 and whose next
 * set bit lies above bit 23.
 */
#define RB_MASK_PLUS_ONE 0x1000100

/*
 * Channel extractors.  ALPHA_8 applies no mask and therefore relies on
 * its argument being an unsigned value no wider than 32 bits.
 */
#define ALPHA_8(x) ((x) >> A_SHIFT)
#define RED_8(x) (((x) >> R_SHIFT) & MASK)
#define GREEN_8(x) (((x) >> G_SHIFT) & MASK)
#define BLUE_8(x) ((x) & MASK)
michael@0: 
/*
 * ARMv6 has the UQADD8 instruction, which implements unsigned saturated
 * addition for 8-bit values packed in 32-bit registers. It is very useful
 * for the UN8x4_ADD_UN8x4, UN8_rb_ADD_UN8_rb and ADD_UN8 macros (which
 * would otherwise need a lot of arithmetic operations to simulate this
 * operation). Since most of the major ARM Linux distros are built for
 * ARMv7, we are much less dependent on runtime CPU detection and can get
 * practical benefits from conditional compilation here for a lot of users.
 */
michael@0: 
/*
 * Inline-assembly fast path: only compiled for 32-bit ARM with GCC-style
 * inline asm enabled, and either ARM state or Thumb-2 (plain Thumb-1 is
 * excluded by the __thumb__/__thumb2__ test).
 */
#if defined(USE_GCC_INLINE_ASM) && defined(__arm__) && \
    !defined(__aarch64__) && (!defined(__thumb__) || defined(__thumb2__))
/*
 * Architectures that provide UQADD8.  The M profile is listed only as
 * ARMv7E-M: ARMv6-M and plain ARMv7-M do not implement the instruction,
 * and ARMv7-M would otherwise pass the Thumb-2 test above and fail to
 * assemble.
 */
#if defined(__ARM_ARCH_6__)   || defined(__ARM_ARCH_6J__)  || \
    defined(__ARM_ARCH_6K__)  || defined(__ARM_ARCH_6Z__)  || \
    defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) || \
    defined(__ARM_ARCH_7__)   || defined(__ARM_ARCH_7A__)  || \
    defined(__ARM_ARCH_7R__)  || defined(__ARM_ARCH_7EM__)

/*
 * Returns the byte-wise unsigned saturating sum of x and y: each of the
 * four 8-bit lanes is added independently and clamped at 0xff.
 *
 * __asm__ (rather than asm) keeps this usable when the compiler runs in a
 * strict ISO mode such as -std=c99.  The "%" in the first input constraint
 * tells the compiler that the two inputs are commutative.
 */
static force_inline uint32_t
un8x4_add_un8x4 (uint32_t x, uint32_t y)
{
    uint32_t t;
    __asm__ ("uqadd8 %0, %1, %2" : "=r" (t) : "%r" (x), "r" (y));
    return t;
}

/* x = saturating byte-wise x + y (expression form). */
#define UN8x4_ADD_UN8x4(x, y) \
    ((x) = un8x4_add_un8x4 ((x), (y)))

/* Two-lane saturating add; the result is left in both t and x. */
#define UN8_rb_ADD_UN8_rb(x, y, t) \
    ((t) = un8x4_add_un8x4 ((x), (y)), (x) = (t))

/* Copies x into t, then yields the saturating sum of t and y. */
#define ADD_UN8(x, y, t) \
    ((t) = (x), un8x4_add_un8x4 ((t), (y)))

#endif
#endif
michael@0: 
michael@0: /*****************************************************************************/
michael@0: 
michael@0: /*
michael@0:  * Helper macros.
michael@0:  */
michael@0: 
/*
 * MUL_UN8: for 8-bit a and b, yields (a * b) / 255 rounded to nearest.
 * t is a caller-supplied scratch lvalue; on exit it holds
 * a * b + ONE_HALF.  The division by 255 is done as (t + (t >> 8)) >> 8.
 */
#define MUL_UN8(a, b, t)						\
    ((t) = ONE_HALF + (a) * (uint16_t) (b),				\
     ((t) + ((t) >> G_SHIFT)) >> G_SHIFT)
michael@0: 
/*
 * DIV_UN8: yields (a * 255) / b rounded to nearest, by adding half the
 * divisor before the integer division.  b is evaluated more than once
 * and must be non-zero.
 */
#define DIV_UN8(a, b)							\
    ((((b) / 2) + (uint16_t) (a) * MASK) / (b))
michael@0: 
/*
 * ADD_UN8: portable saturating add of two 8-bit values (used when no
 * platform-specific ADD_UN8 has been defined earlier).
 *
 * t is a caller-supplied scratch lvalue that receives the raw sum x + y.
 * If the sum carried into bit 8, 0 - (t >> 8) has all of its low bits set,
 * so OR-ing it in forces the low byte to 0xff before the narrowing cast.
 */
#ifndef ADD_UN8
#define ADD_UN8(x, y, t)						\
    ((t) = (x) + (y),							\
     (uint32_t) (uint8_t) ((0 - ((t) >> G_SHIFT)) | (t)))
#endif
michael@0: 
/*
 * DIV_ONE_UN8: yields x / 255 rounded to nearest for x that is a product
 * of two 8-bit values, using the bias-then-shift form
 * (v + (v >> 8)) >> 8 with v = x + ONE_HALF.  x is evaluated twice.
 */
#define DIV_ONE_UN8(x)							\
    ((((x) + ONE_HALF) + (((x) + ONE_HALF) >> G_SHIFT)) >> G_SHIFT)
michael@0: 
michael@0: /*
michael@0:  * The methods below use some tricks to be able to do two color
michael@0:  * components at the same time.
michael@0:  */
michael@0: 
/*
 * x_rb = (x_rb * a) / 255
 *
 * Multiplies the two 8-bit lanes of x selected by RB_MASK (bits 0-7 and
 * 16-23) by the 8-bit factor a, with rounding, and stores the two results
 * back into the same lanes of x; all other bits of x end up zero.
 * x and t must be 32-bit lvalues.  t is scratch and is left holding the
 * biased product (x & RB_MASK) * a + RB_ONE_HALF.
 */
#define UN8_rb_MUL_UN8(x, a, t)						\
    do									\
    {									\
	(t) = ((x) & RB_MASK) * (a) + RB_ONE_HALF;			\
	(x) = (((((t) >> G_SHIFT) & RB_MASK) + (t)) >> G_SHIFT)	\
	      & RB_MASK;						\
    } while (0)
michael@0: 
/*
 * x_rb = min (x_rb + y_rb, 255)
 *
 * Portable two-lane saturating add (used when no platform-specific
 * UN8_rb_ADD_UN8_rb has been defined earlier).  x and y are expected to
 * carry data only in the RB_MASK lanes, so each lane sum fits in 9 bits.
 * The carry of each lane (bits 8 and 24 of the sum) is moved down to the
 * lane's bit 0 and subtracted from RB_MASK_PLUS_ONE, which yields 0xff in
 * every lane that overflowed; OR-ing that in clamps the lane.
 * x and t must be 32-bit lvalues; t is scratch.
 */
#ifndef UN8_rb_ADD_UN8_rb
#define UN8_rb_ADD_UN8_rb(x, y, t)					\
    do									\
    {									\
	(t) = (x) + (y);						\
	(t) |= RB_MASK_PLUS_ONE - (((t) >> G_SHIFT) & RB_MASK);		\
	(x) = RB_MASK & (t);						\
    } while (0)
#endif
michael@0: 
/*
 * x_rb = (x_rb * a_rb) / 255
 *
 * Lane-wise multiply: the low lane (bits 0-7) of x is multiplied by the
 * low lane of a, the high lane (bits 16-23) of x by the high lane of a,
 * each with rounding.  The results are stored in the same lanes of x and
 * all other bits of x end up zero.
 *
 * x and t must be 32-bit lvalues; t is scratch.  Every use of a macro
 * parameter is parenthesized, so a may be an arbitrary expression
 * (e.g. "hi | lo") without changing the meaning of the expansion.
 */
#define UN8_rb_MUL_UN8_rb(x, a, t)					\
    do									\
    {									\
	(t)  = ((x) & MASK) * ((a) & MASK);				\
	(t) |= ((x) & R_MASK) * (((a) >> R_SHIFT) & MASK);		\
	(t) += RB_ONE_HALF;						\
	(t)  = ((t) + (((t) >> G_SHIFT) & RB_MASK)) >> G_SHIFT;		\
	(x)  = (t) & RB_MASK;						\
    } while (0)
michael@0: 
/*
 * x_c = (x_c * a) / 255
 *
 * Scales all four 8-bit channels packed in x by the 8-bit factor a, with
 * rounding, and stores the result back into x.  The red/blue pair and the
 * alpha/green pair (shifted down by G_SHIFT) are each handled by one
 * UN8_rb_MUL_UN8.
 *
 * x must be a 32-bit lvalue; it is read once and written once.  a is
 * evaluated exactly once, so an argument with side effects (such as
 * "*mask++") behaves as written.
 */
#define UN8x4_MUL_UN8(x, a)						\
    do									\
    {									\
	uint32_t r1__, r2__, t__;					\
	uint32_t x__ = (x);						\
	uint32_t a__ = (a);						\
									\
	r1__ = x__;							\
	UN8_rb_MUL_UN8 (r1__, a__, t__);				\
									\
	r2__ = x__ >> G_SHIFT;						\
	UN8_rb_MUL_UN8 (r2__, a__, t__);				\
									\
	(x) = r1__ | (r2__ << G_SHIFT);					\
    } while (0)
michael@0: 
/*
 * x_c = min ((x_c * a) / 255 + y_c, 255)
 *
 * Scales all four channels of x by the 8-bit factor a, then adds the
 * matching channel of y with saturation, and stores the result in x.
 * Each channel pair (red/blue, then alpha/green shifted down by G_SHIFT)
 * goes through UN8_rb_MUL_UN8 followed by UN8_rb_ADD_UN8_rb.
 *
 * x must be a 32-bit lvalue; it is read once and written once.  a and y
 * are each evaluated exactly once, so arguments with side effects are
 * safe.
 */
#define UN8x4_MUL_UN8_ADD_UN8x4(x, a, y)				\
    do									\
    {									\
	uint32_t r1__, r2__, r3__, t__;					\
	uint32_t x__ = (x);						\
	uint32_t y__ = (y);						\
	uint32_t a__ = (a);						\
									\
	r1__ = x__;							\
	r2__ = y__ & RB_MASK;						\
	UN8_rb_MUL_UN8 (r1__, a__, t__);				\
	UN8_rb_ADD_UN8_rb (r1__, r2__, t__);				\
									\
	r2__ = x__ >> G_SHIFT;						\
	r3__ = (y__ >> G_SHIFT) & RB_MASK;				\
	UN8_rb_MUL_UN8 (r2__, a__, t__);				\
	UN8_rb_ADD_UN8_rb (r2__, r3__, t__);				\
									\
	(x) = r1__ | (r2__ << G_SHIFT);					\
    } while (0)
michael@0: 
/*
 * x_c = min ((x_c * a) / 255 + (y_c * b) / 255, 255)
 *
 * Scales every channel of x by the 8-bit factor a and every channel of y
 * by the 8-bit factor b (each product rounded separately), adds the two
 * with saturation, and stores the result in x.
 *
 * x must be a 32-bit lvalue; it is read once and written once.  a, y and
 * b are each evaluated exactly once, so arguments with side effects are
 * safe.
 */
#define UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8(x, a, y, b)			\
    do									\
    {									\
	uint32_t r1__, r2__, r3__, t__;					\
	uint32_t x__ = (x);						\
	uint32_t a__ = (a);						\
	uint32_t y__ = (y);						\
	uint32_t b__ = (b);						\
									\
	r1__ = x__;							\
	r2__ = y__;							\
	UN8_rb_MUL_UN8 (r1__, a__, t__);				\
	UN8_rb_MUL_UN8 (r2__, b__, t__);				\
	UN8_rb_ADD_UN8_rb (r1__, r2__, t__);				\
									\
	r2__ = x__ >> G_SHIFT;						\
	r3__ = y__ >> G_SHIFT;						\
	UN8_rb_MUL_UN8 (r2__, a__, t__);				\
	UN8_rb_MUL_UN8 (r3__, b__, t__);				\
	UN8_rb_ADD_UN8_rb (r2__, r3__, t__);				\
									\
	(x) = r1__ | (r2__ << G_SHIFT);					\
    } while (0)
michael@0: 
/*
 * x_c = (x_c * a_c) / 255
 *
 * Channel-wise multiply of two packed 4x8-bit values: each channel of x
 * is multiplied by the matching channel of a, with rounding, and the
 * result is stored in x.  The red/blue pair and the alpha/green pair
 * (shifted down by G_SHIFT) are each handled by one UN8_rb_MUL_UN8_rb.
 *
 * x must be a 32-bit lvalue; it is read once and written once.  a is
 * evaluated exactly once, so an argument with side effects is safe.
 */
#define UN8x4_MUL_UN8x4(x, a)						\
    do									\
    {									\
	uint32_t r1__, r2__, r3__, t__;					\
	uint32_t x__ = (x);						\
	uint32_t a__ = (a);						\
									\
	r1__ = x__;							\
	r2__ = a__;							\
	UN8_rb_MUL_UN8_rb (r1__, r2__, t__);				\
									\
	r2__ = x__ >> G_SHIFT;						\
	r3__ = a__ >> G_SHIFT;						\
	UN8_rb_MUL_UN8_rb (r2__, r3__, t__);				\
									\
	(x) = r1__ | (r2__ << G_SHIFT);					\
    } while (0)
michael@0: 
/*
 * x_c = min ((x_c * a_c) / 255 + y_c, 255)
 *
 * Multiplies each channel of x by the matching channel of a (rounded),
 * then adds the matching channel of y with saturation, and stores the
 * result in x.
 *
 * x must be a 32-bit lvalue; it is read once and written once.  a and y
 * are each evaluated exactly once, so arguments with side effects are
 * safe.
 */
#define UN8x4_MUL_UN8x4_ADD_UN8x4(x, a, y)				\
    do									\
    {									\
	uint32_t r1__, r2__, r3__, t__;					\
	uint32_t x__ = (x);						\
	uint32_t a__ = (a);						\
	uint32_t y__ = (y);						\
									\
	r1__ = x__;							\
	r2__ = a__;							\
	UN8_rb_MUL_UN8_rb (r1__, r2__, t__);				\
	r2__ = y__ & RB_MASK;						\
	UN8_rb_ADD_UN8_rb (r1__, r2__, t__);				\
									\
	r2__ = x__ >> G_SHIFT;						\
	r3__ = a__ >> G_SHIFT;						\
	UN8_rb_MUL_UN8_rb (r2__, r3__, t__);				\
	r3__ = (y__ >> G_SHIFT) & RB_MASK;				\
	UN8_rb_ADD_UN8_rb (r2__, r3__, t__);				\
									\
	(x) = r1__ | (r2__ << G_SHIFT);					\
    } while (0)
michael@0: 
/*
 * x_c = min ((x_c * a_c) / 255 + (y_c * b) / 255, 255)
 *
 * Multiplies each channel of x by the matching channel of a, scales each
 * channel of y by the single 8-bit factor b (each product rounded
 * separately), adds the two with saturation, and stores the result in x.
 *
 * x must be a 32-bit lvalue; it is read once and written once.  a, y and
 * b are each evaluated exactly once, so arguments with side effects are
 * safe.
 */
#define UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8(x, a, y, b)			\
    do									\
    {									\
	uint32_t r1__, r2__, r3__, t__;					\
	uint32_t x__ = (x);						\
	uint32_t a__ = (a);						\
	uint32_t y__ = (y);						\
	uint32_t b__ = (b);						\
									\
	r1__ = x__;							\
	r2__ = a__;							\
	UN8_rb_MUL_UN8_rb (r1__, r2__, t__);				\
	r2__ = y__;							\
	UN8_rb_MUL_UN8 (r2__, b__, t__);				\
	UN8_rb_ADD_UN8_rb (r1__, r2__, t__);				\
									\
	r2__ = x__ >> G_SHIFT;						\
	r3__ = a__ >> G_SHIFT;						\
	UN8_rb_MUL_UN8_rb (r2__, r3__, t__);				\
	r3__ = y__ >> G_SHIFT;						\
	UN8_rb_MUL_UN8 (r3__, b__, t__);				\
	UN8_rb_ADD_UN8_rb (r2__, r3__, t__);				\
									\
	(x) = r1__ | (r2__ << G_SHIFT);					\
    } while (0)
michael@0: 
/*
 * x_c = min (x_c + y_c, 255)
 *
 * Portable channel-wise saturating add of two packed 4x8-bit values
 * (used when no platform-specific UN8x4_ADD_UN8x4 has been defined
 * earlier).  The red/blue pair and the alpha/green pair (shifted down by
 * G_SHIFT) are each handled by one UN8_rb_ADD_UN8_rb.
 *
 * x must be a 32-bit lvalue; it is read once and written once.  y is
 * evaluated exactly once, so an argument with side effects is safe.
 */
#ifndef UN8x4_ADD_UN8x4
#define UN8x4_ADD_UN8x4(x, y)						\
    do									\
    {									\
	uint32_t r1__, r2__, r3__, t__;					\
	uint32_t x__ = (x);						\
	uint32_t y__ = (y);						\
									\
	r1__ = x__ & RB_MASK;						\
	r2__ = y__ & RB_MASK;						\
	UN8_rb_ADD_UN8_rb (r1__, r2__, t__);				\
									\
	r2__ = (x__ >> G_SHIFT) & RB_MASK;				\
	r3__ = (y__ >> G_SHIFT) & RB_MASK;				\
	UN8_rb_ADD_UN8_rb (r2__, r3__, t__);				\
									\
	(x) = r1__ | (r2__ << G_SHIFT);					\
    } while (0)
#endif