media/libtheora/lib/arm/armidct.s

changeset 6474c204b198

;********************************************************************
;*                                                                  *
;* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
;* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
;* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
;* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
;*                                                                  *
;* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
;* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
;*                                                                  *
;********************************************************************
; Original implementation:
;  Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd
; last mod: $Id: armidct.s 17481 2010-10-03 22:49:42Z tterribe $
;********************************************************************

	AREA	|.text|, CODE, READONLY

	; Explicitly specifying alignment here because some versions of
	; gas don't align code correctly. See
	; http://lists.gnu.org/archive/html/bug-binutils/2011-06/msg00199.html
	; https://bugzilla.mozilla.org/show_bug.cgi?id=920992
	ALIGN

	GET	armopts.s

	EXPORT	oc_idct8x8_1_arm
	EXPORT	oc_idct8x8_arm

oc_idct8x8_1_arm PROC
	; r0 = ogg_int16_t  *_y
	; r1 = ogg_uint16_t  _dc
	ORR	r1, r1, r1, LSL #16
	MOV	r2, r1
	MOV	r3, r1
	MOV	r12,r1
	STMIA	r0!,{r1,r2,r3,r12}
	STMIA	r0!,{r1,r2,r3,r12}
	STMIA	r0!,{r1,r2,r3,r12}
	STMIA	r0!,{r1,r2,r3,r12}
	STMIA	r0!,{r1,r2,r3,r12}
	STMIA	r0!,{r1,r2,r3,r12}
	STMIA	r0!,{r1,r2,r3,r12}
	STMIA	r0!,{r1,r2,r3,r12}
	MOV	PC, r14
	ENDP
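; In rough C terms the DC-only routine above is just (an illustrative
;  sketch, not the actual library source):
;   for(i=0;i<64;i++)_y[i]=(ogg_int16_t)_dc;
; _dc is duplicated into both halfwords of r1 so that each STMIA of four
;  registers stores eight 16-bit coefficients at once.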

oc_idct8x8_arm PROC
	; r0 = ogg_int16_t *_y
	; r1 = ogg_int16_t *_x
	; r2 = int          _last_zzi
	CMP	r2, #3
	BLE	oc_idct8x8_3_arm
	CMP	r2, #6
	BLE	oc_idct8x8_6_arm
	CMP	r2, #10
	BLE	oc_idct8x8_10_arm
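; _last_zzi is the index of the last nonzero coefficient in zig-zag order,
;  so small values mean only low-frequency coefficients are present and a
;  pruned transform suffices; roughly (illustrative C, not the library
;  source):
;   if(_last_zzi<=3)idct8x8_3();
;   else if(_last_zzi<=6)idct8x8_6();
;   else if(_last_zzi<=10)idct8x8_10();
;   else idct8x8_slow();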
oc_idct8x8_slow_arm
	STMFD	r13!,{r4-r11,r14}
	SUB	r13,r13,#64*2
; Row transforms
	STR	r0, [r13,#-4]!
	ADD	r0, r13, #4	; Write to temp storage.
	BL	idct8core_arm
	BL	idct8core_arm
	BL	idct8core_arm
	BL	idct8core_arm
	BL	idct8core_arm
	BL	idct8core_arm
	BL	idct8core_arm
	BL	idct8core_arm
	LDR	r0, [r13], #4	; Write to the final destination.
	; Clear input data for next block (decoder only).
	SUB	r2, r1, #8*16
	CMP	r0, r2
	MOV	r1, r13		; And read from temp storage.
	BEQ	oc_idct8x8_slow_arm_cols
	MOV	r4, #0
	MOV	r5, #0
	MOV	r6, #0
	MOV	r7, #0
	STMIA	r2!,{r4,r5,r6,r7}
	STMIA	r2!,{r4,r5,r6,r7}
	STMIA	r2!,{r4,r5,r6,r7}
	STMIA	r2!,{r4,r5,r6,r7}
	STMIA	r2!,{r4,r5,r6,r7}
	STMIA	r2!,{r4,r5,r6,r7}
	STMIA	r2!,{r4,r5,r6,r7}
	STMIA	r2!,{r4,r5,r6,r7}
oc_idct8x8_slow_arm_cols
; Column transforms
	BL	idct8core_down_arm
	BL	idct8core_down_arm
	BL	idct8core_down_arm
	BL	idct8core_down_arm
	BL	idct8core_down_arm
	BL	idct8core_down_arm
	BL	idct8core_down_arm
	BL	idct8core_down_arm
	ADD	r13,r13,#64*2
	LDMFD	r13!,{r4-r11,PC}
	ENDP
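; The 2-D transform above is computed separably: eight 1-D row transforms
;  write into a 64-coefficient temp buffer on the stack, then eight 1-D
;  column transforms (the _down cores, which add the final rounding and
;  shift) write to the destination. When _y!=_x the input block is also
;  zeroed for the next block, roughly: if(_y!=_x)memset(_x,0,64*2);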

oc_idct8x8_10_arm PROC
	STMFD	r13!,{r4-r11,r14}
	SUB	r13,r13,#64*2
; Row transforms
	MOV	r2, r0
	MOV	r0, r13		; Write to temp storage.
	BL	idct4core_arm
	BL	idct3core_arm
	BL	idct2core_arm
	BL	idct1core_arm
	; Clear input data for next block (decoder only).
	SUB	r0, r1, #4*16
	CMP	r0, r2
	MOV	r1, r13		; Read from temp storage.
	BEQ	oc_idct8x8_10_arm_cols
	MOV	r4, #0
	STR	r4, [r0]
	STR	r4, [r0,#4]
	STR	r4, [r0,#16]
	STR	r4, [r0,#20]
	STR	r4, [r0,#32]
	STR	r4, [r0,#48]
	MOV	r0, r2		; Write to the final destination
oc_idct8x8_10_arm_cols
; Column transforms
	BL	idct4core_down_arm
	BL	idct4core_down_arm
	BL	idct4core_down_arm
	BL	idct4core_down_arm
	BL	idct4core_down_arm
	BL	idct4core_down_arm
	BL	idct4core_down_arm
	BL	idct4core_down_arm
	ADD	r13,r13,#64*2
	LDMFD	r13!,{r4-r11,PC}
	ENDP

oc_idct8x8_6_arm PROC
	STMFD	r13!,{r4-r7,r9-r11,r14}
	SUB	r13,r13,#64*2
; Row transforms
	MOV	r2, r0
	MOV	r0, r13		; Write to temp storage.
	BL	idct3core_arm
	BL	idct2core_arm
	BL	idct1core_arm
	; Clear input data for next block (decoder only).
	SUB	r0, r1, #3*16
	CMP	r0, r2
	MOV	r1, r13		; Read from temp storage.
	BEQ	oc_idct8x8_6_arm_cols
	MOV	r4, #0
	STR	r4, [r0]
	STR	r4, [r0,#4]
	STR	r4, [r0,#16]
	STR	r4, [r0,#32]
	MOV	r0, r2		; Write to the final destination
oc_idct8x8_6_arm_cols
; Column transforms
	BL	idct3core_down_arm
	BL	idct3core_down_arm
	BL	idct3core_down_arm
	BL	idct3core_down_arm
	BL	idct3core_down_arm
	BL	idct3core_down_arm
	BL	idct3core_down_arm
	BL	idct3core_down_arm
	ADD	r13,r13,#64*2
	LDMFD	r13!,{r4-r7,r9-r11,PC}
	ENDP

oc_idct8x8_3_arm PROC
	STMFD	r13!,{r4-r7,r9-r11,r14}
	SUB	r13,r13,#64*2
; Row transforms
	MOV	r2, r0
	MOV	r0, r13		; Write to temp storage.
	BL	idct2core_arm
	BL	idct1core_arm
	; Clear input data for next block (decoder only).
	SUB	r0, r1, #2*16
	CMP	r0, r2
	MOV	r1, r13		; Read from temp storage.
	MOVNE	r4, #0
	STRNE	r4, [r0]
	STRNE	r4, [r0,#16]
	MOVNE	r0, r2		; Write to the final destination
; Column transforms
	BL	idct2core_down_arm
	BL	idct2core_down_arm
	BL	idct2core_down_arm
	BL	idct2core_down_arm
	BL	idct2core_down_arm
	BL	idct2core_down_arm
	BL	idct2core_down_arm
	BL	idct2core_down_arm
	ADD	r13,r13,#64*2
	LDMFD	r13!,{r4-r7,r9-r11,PC}
	ENDP

idct1core_arm PROC
	; r0 =       ogg_int16_t *_y (destination)
	; r1 = const ogg_int16_t *_x (source)
	LDRSH	r3, [r1], #16
	MOV	r12,#0x05
	ORR	r12,r12,#0xB500
	MUL	r3, r12, r3
	; Stall ?
	MOV	r3, r3, ASR #16
	STRH	r3, [r0], #2
	STRH	r3, [r0, #14]
	STRH	r3, [r0, #30]
	STRH	r3, [r0, #46]
	STRH	r3, [r0, #62]
	STRH	r3, [r0, #78]
	STRH	r3, [r0, #94]
	STRH	r3, [r0, #110]
	MOV	PC,R14
	ENDP
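; With a single (DC) coefficient every output of the 1-D transform is the
;  same scaled value, i.e. y[i]=OC_C4S4*x[0]>>16 for i=0...7; 0xB505 is
;  OC_C4S4==46341, built with MOV+ORR because it does not fit in a single
;  ARM immediate.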

idct2core_arm PROC
	; r0 =       ogg_int16_t *_y (destination)
	; r1 = const ogg_int16_t *_x (source)
	LDRSH	r9, [r1], #16		; r9 = x[0]
	LDR	r12,OC_C4S4
	LDRSH	r11,[r1, #-14]		; r11= x[1]
	LDR	r3, OC_C7S1
	MUL	r9, r12,r9		; r9 = t[0]<<16 = OC_C4S4*x[0]
	LDR	r10,OC_C1S7
	MUL	r3, r11,r3		; r3 = t[4]<<16 = OC_C7S1*x[1]
	MOV	r9, r9, ASR #16		; r9 = t[0]
	MUL	r11,r10,r11		; r11= t[7]<<16 = OC_C1S7*x[1]
	MOV	r3, r3, ASR #16		; r3 = t[4]
	MUL	r10,r12,r3		; r10= t[5]<<16 = OC_C4S4*t[4]
	MOV	r11,r11,ASR #16		; r11= t[7]
	MUL	r12,r11,r12		; r12= t[6]<<16 = OC_C4S4*t[7]
	MOV	r10,r10,ASR #16		; r10= t[5]
	ADD	r12,r9,r12,ASR #16	; r12= t[0]+t[6]
	ADD	r12,r12,r10		; r12= t[0]+t2[6] = t[0]+t[6]+t[5]
	SUB	r10,r12,r10,LSL #1	; r10= t[0]+t2[5] = t[0]+t[6]-t[5]
	ADD	r3, r3, r9		; r3 = t[0]+t[4]
	ADD	r11,r11,r9		; r11= t[0]+t[7]
	STRH	r11,[r0], #2		; y[0] = t[0]+t[7]
	STRH	r12,[r0, #14]		; y[1] = t[0]+t[6]
	STRH	r10,[r0, #30]		; y[2] = t[0]+t[5]
	STRH	r3, [r0, #46]		; y[3] = t[0]+t[4]
	RSB	r3, r3, r9, LSL #1	; r3 = t[0]*2-(t[0]+t[4])=t[0]-t[4]
	RSB	r10,r10,r9, LSL #1	; r10= t[0]*2-(t[0]+t[5])=t[0]-t[5]
	RSB	r12,r12,r9, LSL #1	; r12= t[0]*2-(t[0]+t[6])=t[0]-t[6]
	RSB	r11,r11,r9, LSL #1	; r11= t[0]*2-(t[0]+t[7])=t[0]-t[7]
	STRH	r3, [r0, #62]		; y[4] = t[0]-t[4]
	STRH	r10,[r0, #78]		; y[5] = t[0]-t[5]
	STRH	r12,[r0, #94]		; y[6] = t[0]-t[6]
	STRH	r11,[r0, #110]		; y[7] = t[0]-t[7]
	MOV	PC,r14
	ENDP
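; With only x[0] and x[1] nonzero, the routine above reduces to
;  (illustrative C, not the library source):
;   t0=OC_C4S4*x[0]>>16; t4=OC_C7S1*x[1]>>16; t7=OC_C1S7*x[1]>>16;
;   t5=OC_C4S4*t4>>16;   t6=OC_C4S4*t7>>16;
;   y[0]=t0+t7; y[1]=t0+t6+t5; y[2]=t0+t6-t5; y[3]=t0+t4;
;   y[4]=t0-t4; y[5]=t0-t6+t5; y[6]=t0-t6-t5; y[7]=t0-t7;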

idct2core_down_arm PROC
	; r0 =       ogg_int16_t *_y (destination)
	; r1 = const ogg_int16_t *_x (source)
	LDRSH	r9, [r1], #16		; r9 = x[0]
	LDR	r12,OC_C4S4
	LDRSH	r11,[r1, #-14]		; r11= x[1]
	LDR	r3, OC_C7S1
	MUL	r9, r12,r9		; r9 = t[0]<<16 = OC_C4S4*x[0]
	LDR	r10,OC_C1S7
	MUL	r3, r11,r3		; r3 = t[4]<<16 = OC_C7S1*x[1]
	MOV	r9, r9, ASR #16		; r9 = t[0]
	MUL	r11,r10,r11		; r11= t[7]<<16 = OC_C1S7*x[1]
	ADD	r9, r9, #8		; r9 = t[0]+8
	MOV	r3, r3, ASR #16		; r3 = t[4]
	MUL	r10,r12,r3		; r10= t[5]<<16 = OC_C4S4*t[4]
	MOV	r11,r11,ASR #16		; r11= t[7]
	MUL	r12,r11,r12		; r12= t[6]<<16 = OC_C4S4*t[7]
	MOV	r10,r10,ASR #16		; r10= t[5]
	ADD	r12,r9,r12,ASR #16	; r12= t[0]+t[6]+8
	ADD	r12,r12,r10		; r12= t[0]+t2[6] = t[0]+t[6]+t[5]+8
	SUB	r10,r12,r10,LSL #1	; r10= t[0]+t2[5] = t[0]+t[6]-t[5]+8
	ADD	r3, r3, r9		; r3 = t[0]+t[4]+8
	ADD	r11,r11,r9		; r11= t[0]+t[7]+8
	; TODO: This is wrong.
	; The C code truncates to 16 bits by storing to RAM and doing the
	;  shifts later; we've got an extra 4 bits here.
	MOV	r4, r11,ASR #4
	MOV	r5, r12,ASR #4
	MOV	r6, r10,ASR #4
	MOV	r7, r3, ASR #4
	RSB	r3, r3, r9, LSL #1	;r3 =t[0]*2+8-(t[0]+t[4])=t[0]-t[4]+8
	RSB	r10,r10,r9, LSL #1	;r10=t[0]*2+8-(t[0]+t[5])=t[0]-t[5]+8
	RSB	r12,r12,r9, LSL #1	;r12=t[0]*2+8-(t[0]+t[6])=t[0]-t[6]+8
	RSB	r11,r11,r9, LSL #1	;r11=t[0]*2+8-(t[0]+t[7])=t[0]-t[7]+8
	MOV	r3, r3, ASR #4
	MOV	r10,r10,ASR #4
	MOV	r12,r12,ASR #4
	MOV	r11,r11,ASR #4
	STRH	r4, [r0], #2		; y[0] = t[0]+t[7]
	STRH	r5, [r0, #14]		; y[1] = t[0]+t[6]
	STRH	r6, [r0, #30]		; y[2] = t[0]+t[5]
	STRH	r7, [r0, #46]		; y[3] = t[0]+t[4]
	STRH	r3, [r0, #62]		; y[4] = t[0]-t[4]
	STRH	r10,[r0, #78]		; y[5] = t[0]-t[5]
	STRH	r12,[r0, #94]		; y[6] = t[0]-t[6]
	STRH	r11,[r0, #110]		; y[7] = t[0]-t[7]
	MOV	PC,r14
	ENDP
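; The _down cores are used for the column pass; they differ from the row
;  cores only in folding the final rounding of the 2-D transform into the
;  outputs, i.e. y[i]=(t[i]+8)>>4, which is why 8 is added to t[0] up front
;  and every result is shifted right by 4 before the store.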

idct3core_arm PROC
	LDRSH	r9, [r1], #16		; r9 = x[0]
	LDR	r12,OC_C4S4		; r12= OC_C4S4
	LDRSH	r3, [r1, #-12]		; r3 = x[2]
	LDR	r10,OC_C6S2		; r10= OC_C6S2
	MUL	r9, r12,r9		; r9 = t[0]<<16 = OC_C4S4*x[0]
	LDR	r4, OC_C2S6		; r4 = OC_C2S6
	MUL	r10,r3, r10		; r10= t[2]<<16 = OC_C6S2*x[2]
	LDRSH	r11,[r1, #-14]		; r11= x[1]
	MUL	r3, r4, r3		; r3 = t[3]<<16 = OC_C2S6*x[2]
	LDR	r4, OC_C7S1		; r4 = OC_C7S1
	LDR	r5, OC_C1S7		; r5 = OC_C1S7
	MOV	r9, r9, ASR #16		; r9 = t[0]
	MUL	r4, r11,r4		; r4 = t[4]<<16 = OC_C7S1*x[1]
	ADD	r3, r9, r3, ASR #16	; r3 = t[0]+t[3]
	MUL	r11,r5, r11		; r11= t[7]<<16 = OC_C1S7*x[1]
	MOV	r4, r4, ASR #16		; r4 = t[4]
	MUL	r5, r12,r4		; r5 = t[5]<<16 = OC_C4S4*t[4]
	MOV	r11,r11,ASR #16		; r11= t[7]
	MUL	r12,r11,r12		; r12= t[6]<<16 = OC_C4S4*t[7]
	ADD	r10,r9, r10,ASR #16	; r10= t[1] = t[0]+t[2]
	RSB	r6, r10,r9, LSL #1	; r6 = t[2] = t[0]-t[2]
					; r3 = t2[0] = t[0]+t[3]
	RSB	r9, r3, r9, LSL #1	; r9 = t2[3] = t[0]-t[3]
	MOV	r12,r12,ASR #16		; r12= t[6]
	ADD	r5, r12,r5, ASR #16	; r5 = t2[6] = t[6]+t[5]
	RSB	r12,r5, r12,LSL #1	; r12= t2[5] = t[6]-t[5]
	ADD	r11,r3, r11		; r11= t2[0]+t[7]
	ADD	r5, r10,r5		; r5 = t[1]+t2[6]
	ADD	r12,r6, r12		; r12= t[2]+t2[5]
	ADD	r4, r9, r4		; r4 = t2[3]+t[4]
	STRH	r11,[r0], #2		; y[0] = t[0]+t[7]
	STRH	r5, [r0, #14]		; y[1] = t[1]+t2[6]
	STRH	r12,[r0, #30]		; y[2] = t[2]+t2[5]
	STRH	r4, [r0, #46]		; y[3] = t2[3]+t[4]
	RSB	r11,r11,r3, LSL #1	; r11= t2[0] - t[7]
	RSB	r5, r5, r10,LSL #1	; r5 = t[1]  - t2[6]
	RSB	r12,r12,r6, LSL #1	; r12= t[2]  - t2[5]
	RSB	r4, r4, r9, LSL #1	; r4 = t2[3] - t[4]
	STRH	r4, [r0, #62]		; y[4] = t2[3]-t[4]
	STRH	r12,[r0, #78]		; y[5] = t[2]-t2[5]
	STRH	r5, [r0, #94]		; y[6] = t[1]-t2[6]
	STRH	r11,[r0, #110]		; y[7] = t2[0]-t[7]
	MOV	PC,R14
	ENDP

idct3core_down_arm PROC
	LDRSH	r9, [r1], #16		; r9 = x[0]
	LDR	r12,OC_C4S4		; r12= OC_C4S4
	LDRSH	r3, [r1, #-12]		; r3 = x[2]
	LDR	r10,OC_C6S2		; r10= OC_C6S2
	MUL	r9, r12,r9		; r9 = t[0]<<16 = OC_C4S4*x[0]
	LDR	r4, OC_C2S6		; r4 = OC_C2S6
	MUL	r10,r3, r10		; r10= t[2]<<16 = OC_C6S2*x[2]
	LDRSH	r11,[r1, #-14]		; r11= x[1]
	MUL	r3, r4, r3		; r3 = t[3]<<16 = OC_C2S6*x[2]
	LDR	r4, OC_C7S1		; r4 = OC_C7S1
	LDR	r5, OC_C1S7		; r5 = OC_C1S7
	MOV	r9, r9, ASR #16		; r9 = t[0]
	MUL	r4, r11,r4		; r4 = t[4]<<16 = OC_C7S1*x[1]
	ADD	r9, r9, #8		; r9 = t[0]+8
	MUL	r11,r5, r11		; r11= t[7]<<16 = OC_C1S7*x[1]
	ADD	r3, r9, r3, ASR #16	; r3 = t[0]+t[3]+8
	MOV	r4, r4, ASR #16		; r4 = t[4]
	MUL	r5, r12,r4		; r5 = t[5]<<16 = OC_C4S4*t[4]
	MOV	r11,r11,ASR #16		; r11= t[7]
	MUL	r12,r11,r12		; r12= t[6]<<16 = OC_C4S4*t[7]
	ADD	r10,r9, r10,ASR #16	; r10= t[1]+8 = t[0]+t[2]+8
	RSB	r6, r10,r9, LSL #1	; r6 = t[2]+8 = t[0]-t[2]+8
					; r3 = t2[0]+8 = t[0]+t[3]+8
	RSB	r9, r3, r9, LSL #1	; r9 = t2[3]+8 = t[0]-t[3]+8
	MOV	r12,r12,ASR #16		; r12= t[6]
	ADD	r5, r12,r5, ASR #16	; r5 = t2[6] = t[6]+t[5]
	RSB	r12,r5, r12,LSL #1	; r12= t2[5] = t[6]-t[5]
	ADD	r11,r3, r11		; r11= t2[0]+t[7] +8
	ADD	r5, r10,r5		; r5 = t[1] +t2[6]+8
	ADD	r12,r6, r12		; r12= t[2] +t2[5]+8
	ADD	r4, r9, r4		; r4 = t2[3]+t[4] +8
	RSB	r3, r11,r3, LSL #1	; r3 = t2[0] - t[7]  + 8
	RSB	r10,r5, r10,LSL #1	; r10= t[1]  - t2[6] + 8
	RSB	r6, r12,r6, LSL #1	; r6 = t[2]  - t2[5] + 8
	RSB	r9, r4, r9, LSL #1	; r9 = t2[3] - t[4]  + 8
	; TODO: This is wrong.
	; The C code truncates to 16 bits by storing to RAM and doing the
	;  shifts later; we've got an extra 4 bits here.
	MOV	r11,r11,ASR #4
	MOV	r5, r5, ASR #4
	MOV	r12,r12,ASR #4
	MOV	r4, r4, ASR #4
	MOV	r9, r9, ASR #4
	MOV	r6, r6, ASR #4
	MOV	r10,r10,ASR #4
	MOV	r3, r3, ASR #4
	STRH	r11,[r0], #2		; y[0] = t[0]+t[7]
	STRH	r5, [r0, #14]		; y[1] = t[1]+t2[6]
	STRH	r12,[r0, #30]		; y[2] = t[2]+t2[5]
	STRH	r4, [r0, #46]		; y[3] = t2[3]+t[4]
	STRH	r9, [r0, #62]		; y[4] = t2[3]-t[4]
	STRH	r6, [r0, #78]		; y[5] = t[2]-t2[5]
	STRH	r10,[r0, #94]		; y[6] = t[1]-t2[6]
	STRH	r3, [r0, #110]		; y[7] = t2[0]-t[7]
	MOV	PC,R14
	ENDP

idct4core_arm PROC
	; r0 =       ogg_int16_t *_y (destination)
	; r1 = const ogg_int16_t *_x (source)
	LDRSH	r9, [r1], #16		; r9 = x[0]
	LDR	r10,OC_C4S4		; r10= OC_C4S4
	LDRSH	r12,[r1, #-12]		; r12= x[2]
	LDR	r4, OC_C6S2		; r4 = OC_C6S2
	MUL	r9, r10,r9		; r9 = t[0]<<16 = OC_C4S4*x[0]
	LDR	r5, OC_C2S6		; r5 = OC_C2S6
	MUL	r4, r12,r4		; r4 = t[2]<<16 = OC_C6S2*x[2]
	LDRSH	r3, [r1, #-14]		; r3 = x[1]
	MUL	r5, r12,r5		; r5 = t[3]<<16 = OC_C2S6*x[2]
	LDR	r6, OC_C7S1		; r6 = OC_C7S1
	LDR	r12,OC_C1S7		; r12= OC_C1S7
	LDRSH	r11,[r1, #-10]		; r11= x[3]
	MUL	r6, r3, r6		; r6 = t[4]<<16 = OC_C7S1*x[1]
	LDR	r7, OC_C5S3		; r7 = OC_C5S3
	MUL	r3, r12,r3		; r3 = t[7]<<16 = OC_C1S7*x[1]
	LDR	r8, OC_C3S5		; r8 = OC_C3S5
	MUL	r7, r11,r7		; r7 = -t[5]<<16 = OC_C5S3*x[3]
	MOV	r9, r9, ASR #16		; r9 = t[0]
	MUL	r11,r8, r11		; r11= t[6]<<16 = OC_C3S5*x[3]
	MOV	r6, r6, ASR #16		; r6 = t[4]
; TODO: This is wrong; t[4]-t[5] and t[7]-t[6] need to be truncated to 16-bit
; before multiplying, not after (this is not equivalent)
	SUB	r7, r6, r7, ASR #16	; r7 = t2[4]=t[4]+t[5] (as r7=-t[5])
	RSB	r6, r7, r6, LSL #1	; r6 = t[4]-t[5]
	MUL	r6, r10,r6		; r6 = t2[5]<<16 =OC_C4S4*(t[4]-t[5])
	MOV	r3, r3, ASR #16		; r3 = t[7]
	ADD	r11,r3, r11,ASR #16	; r11= t2[7]=t[7]+t[6]
	RSB	r3, r11,r3, LSL #1	; r3 = t[7]-t[6]
	MUL	r3, r10,r3		; r3 = t2[6]<<16 =OC_C4S4*(t[7]-t[6])
	ADD	r4, r9, r4, ASR #16	; r4 = t[1] = t[0] + t[2]
	RSB	r10,r4, r9, LSL #1	; r10= t[2] = t[0] - t[2]
	ADD	r5, r9, r5, ASR #16	; r5 = t[0] = t[0] + t[3]
	RSB	r9, r5, r9, LSL #1	; r9 = t[3] = t[0] - t[3]
	MOV	r3, r3, ASR #16		; r3 = t2[6]
	ADD	r6, r3, r6, ASR #16	; r6 = t3[6] = t2[6]+t2[5]
	RSB	r3, r6, r3, LSL #1	; r3 = t3[5] = t2[6]-t2[5]
	ADD	r11,r5, r11		; r11= t[0]+t2[7]
	ADD	r6, r4, r6		; r6 = t[1]+t3[6]
	ADD	r3, r10,r3		; r3 = t[2]+t3[5]
	ADD	r7, r9, r7		; r7 = t[3]+t2[4]
	STRH	r11,[r0], #2		; y[0] = t[0]+t[7]
	STRH	r6, [r0, #14]		; y[1] = t[1]+t2[6]
	STRH	r3, [r0, #30]		; y[2] = t[2]+t2[5]
	STRH	r7, [r0, #46]		; y[3] = t2[3]+t[4]
	RSB	r11,r11,r5, LSL #1	; r11= t[0]-t2[7]
	RSB	r6, r6, r4, LSL #1	; r6 = t[1]-t3[6]
	RSB	r3, r3, r10,LSL #1	; r3 = t[2]-t3[5]
	RSB	r7, r7, r9, LSL #1	; r7 = t[3]-t2[4]
	STRH	r7, [r0, #62]		; y[4] = t2[3]-t[4]
	STRH	r3, [r0, #78]		; y[5] = t[2]-t2[5]
	STRH	r6, [r0, #94]		; y[6] = t[1]-t2[6]
	STRH	r11, [r0, #110]		; y[7] = t2[0]-t[7]
	MOV	PC,r14
	ENDP
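; The truncation issue flagged in the TODO above, in C terms: the reference
;  decoder narrows the butterfly result to 16 bits before the multiply,
;   t4=(ogg_int16_t)(t4-t5); t5=OC_C4S4*t4>>16;
;  while the code above multiplies the full 32-bit difference, which can
;  give different results once the intermediate overflows 16 bits.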

idct4core_down_arm PROC
	; r0 =       ogg_int16_t *_y (destination)
	; r1 = const ogg_int16_t *_x (source)
	LDRSH	r9, [r1], #16		; r9 = x[0]
	LDR	r10,OC_C4S4		; r10= OC_C4S4
	LDRSH	r12,[r1, #-12]		; r12= x[2]
	LDR	r4, OC_C6S2		; r4 = OC_C6S2
	MUL	r9, r10,r9		; r9 = t[0]<<16 = OC_C4S4*x[0]
	LDR	r5, OC_C2S6		; r5 = OC_C2S6
	MUL	r4, r12,r4		; r4 = t[2]<<16 = OC_C6S2*x[2]
	LDRSH	r3, [r1, #-14]		; r3 = x[1]
	MUL	r5, r12,r5		; r5 = t[3]<<16 = OC_C2S6*x[2]
	LDR	r6, OC_C7S1		; r6 = OC_C7S1
	LDR	r12,OC_C1S7		; r12= OC_C1S7
	LDRSH	r11,[r1, #-10]		; r11= x[3]
	MUL	r6, r3, r6		; r6 = t[4]<<16 = OC_C7S1*x[1]
	LDR	r7, OC_C5S3		; r7 = OC_C5S3
	MUL	r3, r12,r3		; r3 = t[7]<<16 = OC_C1S7*x[1]
	LDR	r8, OC_C3S5		; r8 = OC_C3S5
	MUL	r7, r11,r7		; r7 = -t[5]<<16 = OC_C5S3*x[3]
	MOV	r9, r9, ASR #16		; r9 = t[0]
	MUL	r11,r8, r11		; r11= t[6]<<16 = OC_C3S5*x[3]
	MOV	r6, r6, ASR #16		; r6 = t[4]
; TODO: This is wrong; t[4]-t[5] and t[7]-t[6] need to be truncated to 16-bit
; before multiplying, not after (this is not equivalent)
	SUB	r7, r6, r7, ASR #16	; r7 = t2[4]=t[4]+t[5] (as r7=-t[5])
	RSB	r6, r7, r6, LSL #1	; r6 = t[4]-t[5]
	MUL	r6, r10,r6		; r6 = t2[5]<<16 =OC_C4S4*(t[4]-t[5])
	MOV	r3, r3, ASR #16		; r3 = t[7]
	ADD	r11,r3, r11,ASR #16	; r11= t2[7]=t[7]+t[6]
	RSB	r3, r11,r3, LSL #1	; r3 = t[7]-t[6]
	ADD	r9, r9, #8		; r9 = t[0]+8
	MUL	r3, r10,r3		; r3 = t2[6]<<16 =OC_C4S4*(t[7]-t[6])
	ADD	r4, r9, r4, ASR #16	; r4 = t[1] = t[0] + t[2] + 8
	RSB	r10,r4, r9, LSL #1	; r10= t[2] = t[0] - t[2] + 8
	ADD	r5, r9, r5, ASR #16	; r5 = t[0] = t[0] + t[3] + 8
	RSB	r9, r5, r9, LSL #1	; r9 = t[3] = t[0] - t[3] + 8
	MOV	r3, r3, ASR #16		; r3 = t2[6]
	ADD	r6, r3, r6, ASR #16	; r6 = t3[6] = t2[6]+t2[5]
	RSB	r3, r6, r3, LSL #1	; r3 = t3[5] = t2[6]-t2[5]
	ADD	r5, r5, r11		; r5 = t[0]+t2[7]+8
	ADD	r4, r4, r6		; r4 = t[1]+t3[6]+8
	ADD	r10,r10,r3		; r10= t[2]+t3[5]+8
	ADD	r9, r9, r7		; r9 = t[3]+t2[4]+8
	SUB	r11,r5, r11,LSL #1	; r11= t[0]-t2[7]+8
	SUB	r6, r4, r6, LSL #1	; r6 = t[1]-t3[6]+8
	SUB	r3, r10,r3, LSL #1	; r3 = t[2]-t3[5]+8
	SUB	r7, r9, r7, LSL #1	; r7 = t[3]-t2[4]+8
	; TODO: This is wrong.
	; The C code truncates to 16 bits by storing to RAM and doing the
	;  shifts later; we've got an extra 4 bits here.
	MOV	r11,r11,ASR #4
	MOV	r6, r6, ASR #4
	MOV	r3, r3, ASR #4
	MOV	r7, r7, ASR #4
	MOV	r9, r9, ASR #4
	MOV	r10,r10,ASR #4
	MOV	r4, r4, ASR #4
	MOV	r5, r5, ASR #4
	STRH	r5,[r0], #2		; y[0] = t[0]+t[7]
	STRH	r4, [r0, #14]		; y[1] = t[1]+t2[6]
	STRH	r10,[r0, #30]		; y[2] = t[2]+t2[5]
	STRH	r9, [r0, #46]		; y[3] = t2[3]+t[4]
	STRH	r7, [r0, #62]		; y[4] = t2[3]-t[4]
	STRH	r3, [r0, #78]		; y[5] = t[2]-t2[5]
	STRH	r6, [r0, #94]		; y[6] = t[1]-t2[6]
	STRH	r11,[r0, #110]		; y[7] = t2[0]-t[7]
	MOV	PC,r14
	ENDP

idct8core_arm PROC
	; r0 =       ogg_int16_t *_y (destination)
	; r1 = const ogg_int16_t *_x (source)
	LDRSH	r2, [r1],#16		; r2 = x[0]
	STMFD	r13!,{r1,r14}
	LDRSH	r6, [r1, #-8]		; r6 = x[4]
	LDR	r12,OC_C4S4		; r12= C4S4
	LDRSH	r4, [r1, #-12]		; r4 = x[2]
	ADD	r2, r2, r6		; r2 = x[0] + x[4]
	SUB	r6, r2, r6, LSL #1	; r6 = x[0] - x[4]
	; For spec compliance, these sums must be truncated to 16-bit precision
	; _before_ the multiply (not after).
	; Sadly, ARMv4 provides no simple way to do that.
	MOV	r2, r2, LSL #16
	MOV	r6, r6, LSL #16
	MOV	r2, r2, ASR #16
	MOV	r6, r6, ASR #16
	MUL	r2, r12,r2		; r2 = t[0]<<16 = C4S4*(x[0]+x[4])
	LDRSH	r8, [r1, #-4]		; r8 = x[6]
	LDR	r7, OC_C6S2		; r7 = OC_C6S2
	MUL	r6, r12,r6		; r6 = t[1]<<16 = C4S4*(x[0]-x[4])
	LDR	r14,OC_C2S6		; r14= OC_C2S6
	MUL	r3, r4, r7		; r3 = OC_C6S2*x[2]
	LDR	r5, OC_C7S1		; r5 = OC_C7S1
	MUL	r4, r14,r4		; r4 = OC_C2S6*x[2]
	MOV	r3, r3, ASR #16		; r3 = OC_C6S2*x[2]>>16
	MUL	r14,r8, r14		; r14= OC_C2S6*x[6]
	MOV	r4, r4, ASR #16		; r4 = OC_C2S6*x[2]>>16
	MUL	r8, r7, r8		; r8 = OC_C6S2*x[6]
	LDR	r7, OC_C1S7		; r7 = OC_C1S7
	SUB	r3, r3, r14,ASR #16	; r3=t[2]=C6S2*x[2]>>16-C2S6*x[6]>>16
	LDRSH	r14,[r1, #-14]		; r14= x[1]
	ADD	r4, r4, r8, ASR #16	; r4=t[3]=C2S6*x[2]>>16+C6S2*x[6]>>16
	LDRSH	r8, [r1, #-2]		; r8 = x[7]
	MUL	r9, r5, r14		; r9 = OC_C7S1*x[1]
	LDRSH	r10,[r1, #-6]		; r10= x[5]
	MUL	r14,r7, r14		; r14= OC_C1S7*x[1]
	MOV	r9, r9, ASR #16		; r9 = OC_C7S1*x[1]>>16
	MUL	r7, r8, r7		; r7 = OC_C1S7*x[7]
	MOV	r14,r14,ASR #16		; r14= OC_C1S7*x[1]>>16
	MUL	r8, r5, r8		; r8 = OC_C7S1*x[7]
	LDRSH	r1, [r1, #-10]		; r1 = x[3]
	LDR	r5, OC_C3S5		; r5 = OC_C3S5
	LDR	r11,OC_C5S3		; r11= OC_C5S3
	ADD	r8, r14,r8, ASR #16	; r8=t[7]=C1S7*x[1]>>16+C7S1*x[7]>>16
	MUL	r14,r5, r10		; r14= OC_C3S5*x[5]
	SUB	r9, r9, r7, ASR #16	; r9=t[4]=C7S1*x[1]>>16-C1S7*x[7]>>16
	MUL	r10,r11,r10		; r10= OC_C5S3*x[5]
	MOV	r14,r14,ASR #16		; r14= OC_C3S5*x[5]>>16
	MUL	r11,r1, r11		; r11= OC_C5S3*x[3]
	MOV	r10,r10,ASR #16		; r10= OC_C5S3*x[5]>>16
	MUL	r1, r5, r1		; r1 = OC_C3S5*x[3]
	SUB	r14,r14,r11,ASR #16	;r14=t[5]=C3S5*x[5]>>16-C5S3*x[3]>>16
	ADD	r10,r10,r1, ASR #16	;r10=t[6]=C5S3*x[5]>>16+C3S5*x[3]>>16
	; r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t[7] r9=t[4]
	; r10=t[6] r12=C4S4 r14=t[5]
; TODO: This is wrong; t[4]-t[5] and t[7]-t[6] need to be truncated to 16-bit
; before multiplying, not after (this is not equivalent)
	; Stage 2
	; 4-5 butterfly
	ADD	r9, r9, r14		; r9 = t2[4]     =       t[4]+t[5]
	SUB	r14,r9, r14, LSL #1	; r14=                   t[4]-t[5]
	MUL	r14,r12,r14		; r14= t2[5]<<16 = C4S4*(t[4]-t[5])
	; 7-6 butterfly
	ADD	r8, r8, r10		; r8 = t2[7]     =       t[7]+t[6]
	SUB	r10,r8, r10, LSL #1	; r10=                   t[7]-t[6]
	MUL	r10,r12,r10		; r10= t2[6]<<16 = C4S4*(t[7]-t[6])
	; r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t2[7] r9=t2[4]
	; r10=t2[6]<<16 r12=C4S4 r14=t2[5]<<16
	; Stage 3
	; 0-3 butterfly
	ADD	r2, r4, r2, ASR #16	; r2 = t2[0] = t[0] + t[3]
	SUB	r4, r2, r4, LSL #1	; r4 = t2[3] = t[0] - t[3]
	; 1-2 butterfly
	ADD	r6, r3, r6, ASR #16	; r6 = t2[1] = t[1] + t[2]
	SUB	r3, r6, r3, LSL #1	; r3 = t2[2] = t[1] - t[2]
	; 6-5 butterfly
	MOV	r14,r14,ASR #16		; r14= t2[5]
	ADD	r10,r14,r10,ASR #16	; r10= t3[6] = t[6] + t[5]
	SUB	r14,r10,r14,LSL #1	; r14= t3[5] = t[6] - t[5]
	; r2=t2[0] r3=t2[2] r4=t2[3] r6=t2[1] r8=t2[7] r9=t2[4]
	; r10=t3[6] r14=t3[5]
	; Stage 4
	ADD	r2, r2, r8		; r2 = t[0] + t[7]
	ADD	r6, r6, r10		; r6 = t[1] + t[6]
	ADD	r3, r3, r14		; r3 = t[2] + t[5]
	ADD	r4, r4, r9		; r4 = t[3] + t[4]
	SUB	r8, r2, r8, LSL #1	; r8 = t[0] - t[7]
	SUB	r10,r6, r10,LSL #1	; r10= t[1] - t[6]
	SUB	r14,r3, r14,LSL #1	; r14= t[2] - t[5]
	SUB	r9, r4, r9, LSL #1	; r9 = t[3] - t[4]
	STRH	r2, [r0], #2		; y[0] = t[0]+t[7]
	STRH	r6, [r0, #14]		; y[1] = t[1]+t[6]
	STRH	r3, [r0, #30]		; y[2] = t[2]+t[5]
	STRH	r4, [r0, #46]		; y[3] = t[3]+t[4]
	STRH	r9, [r0, #62]		; y[4] = t[3]-t[4]
	STRH	r14,[r0, #78]		; y[5] = t[2]-t[5]
	STRH	r10,[r0, #94]		; y[6] = t[1]-t[6]
	STRH	r8, [r0, #110]		; y[7] = t[0]-t[7]
	LDMFD	r13!,{r1,PC}
	ENDP
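; The full 8-point core follows the usual four-stage factorization: stage 1
;  does the (1,7), (3,5) and (2,6) rotations plus the 0/4 butterfly, stages
;  2-3 combine t[4]...t[7] with two more OC_C4S4 scalings, and stage 4 forms
;  y[i]=t[i]+t[7-i], y[7-i]=t[i]-t[7-i] for i=0...3.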

idct8core_down_arm PROC
	; r0 =       ogg_int16_t *_y (destination)
	; r1 = const ogg_int16_t *_x (source)
	LDRSH	r2, [r1],#16		; r2 = x[0]
	STMFD	r13!,{r1,r14}
	LDRSH	r6, [r1, #-8]		; r6 = x[4]
	LDR	r12,OC_C4S4		; r12= C4S4
	LDRSH	r4, [r1, #-12]		; r4 = x[2]
	ADD	r2, r2, r6		; r2 = x[0] + x[4]
	SUB	r6, r2, r6, LSL #1	; r6 = x[0] - x[4]
	; For spec compliance, these sums must be truncated to 16-bit precision
	; _before_ the multiply (not after).
	; Sadly, ARMv4 provides no simple way to do that.
	MOV	r2, r2, LSL #16
	MOV	r6, r6, LSL #16
	MOV	r2, r2, ASR #16
	MOV	r6, r6, ASR #16
	MUL	r2, r12,r2		; r2 = t[0]<<16 = C4S4*(x[0]+x[4])
	LDRSH	r8, [r1, #-4]		; r8 = x[6]
	LDR	r7, OC_C6S2		; r7 = OC_C6S2
	MUL	r6, r12,r6		; r6 = t[1]<<16 = C4S4*(x[0]-x[4])
	LDR	r14,OC_C2S6		; r14= OC_C2S6
	MUL	r3, r4, r7		; r3 = OC_C6S2*x[2]
	LDR	r5, OC_C7S1		; r5 = OC_C7S1
	MUL	r4, r14,r4		; r4 = OC_C2S6*x[2]
	MOV	r3, r3, ASR #16		; r3 = OC_C6S2*x[2]>>16
	MUL	r14,r8, r14		; r14= OC_C2S6*x[6]
	MOV	r4, r4, ASR #16		; r4 = OC_C2S6*x[2]>>16
	MUL	r8, r7, r8		; r8 = OC_C6S2*x[6]
	LDR	r7, OC_C1S7		; r7 = OC_C1S7
	SUB	r3, r3, r14,ASR #16	; r3=t[2]=C6S2*x[2]>>16-C2S6*x[6]>>16
	LDRSH	r14,[r1, #-14]		; r14= x[1]
	ADD	r4, r4, r8, ASR #16	; r4=t[3]=C2S6*x[2]>>16+C6S2*x[6]>>16
	LDRSH	r8, [r1, #-2]		; r8 = x[7]
	MUL	r9, r5, r14		; r9 = OC_C7S1*x[1]
	LDRSH	r10,[r1, #-6]		; r10= x[5]
	MUL	r14,r7, r14		; r14= OC_C1S7*x[1]
	MOV	r9, r9, ASR #16		; r9 = OC_C7S1*x[1]>>16
	MUL	r7, r8, r7		; r7 = OC_C1S7*x[7]
	MOV	r14,r14,ASR #16		; r14= OC_C1S7*x[1]>>16
	MUL	r8, r5, r8		; r8 = OC_C7S1*x[7]
	LDRSH	r1, [r1, #-10]		; r1 = x[3]
	LDR	r5, OC_C3S5		; r5 = OC_C3S5
	LDR	r11,OC_C5S3		; r11= OC_C5S3
	ADD	r8, r14,r8, ASR #16	; r8=t[7]=C1S7*x[1]>>16+C7S1*x[7]>>16
	MUL	r14,r5, r10		; r14= OC_C3S5*x[5]
	SUB	r9, r9, r7, ASR #16	; r9=t[4]=C7S1*x[1]>>16-C1S7*x[7]>>16
	MUL	r10,r11,r10		; r10= OC_C5S3*x[5]
	MOV	r14,r14,ASR #16		; r14= OC_C3S5*x[5]>>16
	MUL	r11,r1, r11		; r11= OC_C5S3*x[3]
	MOV	r10,r10,ASR #16		; r10= OC_C5S3*x[5]>>16
	MUL	r1, r5, r1		; r1 = OC_C3S5*x[3]
	SUB	r14,r14,r11,ASR #16	;r14=t[5]=C3S5*x[5]>>16-C5S3*x[3]>>16
	ADD	r10,r10,r1, ASR #16	;r10=t[6]=C5S3*x[5]>>16+C3S5*x[3]>>16
	; r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t[7] r9=t[4]
	; r10=t[6] r12=C4S4 r14=t[5]
	; Stage 2
; TODO: This is wrong; t[4]-t[5] and t[7]-t[6] need to be truncated to 16-bit
; before multiplying, not after (this is not equivalent)
	; 4-5 butterfly
	ADD	r9, r9, r14		; r9 = t2[4]     =       t[4]+t[5]
	SUB	r14,r9, r14, LSL #1	; r14=                   t[4]-t[5]
	MUL	r14,r12,r14		; r14= t2[5]<<16 = C4S4*(t[4]-t[5])
	; 7-6 butterfly
	ADD	r8, r8, r10		; r8 = t2[7]     =       t[7]+t[6]
	SUB	r10,r8, r10, LSL #1	; r10=                   t[7]-t[6]
	MUL	r10,r12,r10		; r10= t2[6]<<16 = C4S4*(t[7]-t[6])
	; r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t2[7] r9=t2[4]
	; r10=t2[6]<<16 r12=C4S4 r14=t2[5]<<16
	; Stage 3
	ADD	r2, r2, #8<<16		; r2 = t[0]+8<<16
	ADD	r6, r6, #8<<16		; r6 = t[1]+8<<16
	; 0-3 butterfly
	ADD	r2, r4, r2, ASR #16	; r2 = t2[0] = t[0] + t[3] + 8
	SUB	r4, r2, r4, LSL #1	; r4 = t2[3] = t[0] - t[3] + 8
	; 1-2 butterfly
	ADD	r6, r3, r6, ASR #16	; r6 = t2[1] = t[1] + t[2] + 8
	SUB	r3, r6, r3, LSL #1	; r3 = t2[2] = t[1] - t[2] + 8
	; 6-5 butterfly
	MOV	r14,r14,ASR #16		; r14= t2[5]
	ADD	r10,r14,r10,ASR #16	; r10= t3[6] = t[6] + t[5]
	SUB	r14,r10,r14,LSL #1	; r14= t3[5] = t[6] - t[5]
	; r2=t2[0] r3=t2[2] r4=t2[3] r6=t2[1] r8=t2[7] r9=t2[4]
	; r10=t3[6] r14=t3[5]
	; Stage 4
	ADD	r2, r2, r8		; r2 = t[0] + t[7] + 8
	ADD	r6, r6, r10		; r6 = t[1] + t[6] + 8
	ADD	r3, r3, r14		; r3 = t[2] + t[5] + 8
	ADD	r4, r4, r9		; r4 = t[3] + t[4] + 8
	SUB	r8, r2, r8, LSL #1	; r8 = t[0] - t[7] + 8
	SUB	r10,r6, r10,LSL #1	; r10= t[1] - t[6] + 8
	SUB	r14,r3, r14,LSL #1	; r14= t[2] - t[5] + 8
	SUB	r9, r4, r9, LSL #1	; r9 = t[3] - t[4] + 8
	; TODO: This is wrong.
	; The C code truncates to 16 bits by storing to RAM and doing the
	;  shifts later; we've got an extra 4 bits here.
	MOV	r2, r2, ASR #4
	MOV	r6, r6, ASR #4
	MOV	r3, r3, ASR #4
	MOV	r4, r4, ASR #4
	MOV	r8, r8, ASR #4
	MOV	r10,r10,ASR #4
	MOV	r14,r14,ASR #4
	MOV	r9, r9, ASR #4
	STRH	r2, [r0], #2		; y[0] = t[0]+t[7]
	STRH	r6, [r0, #14]		; y[1] = t[1]+t[6]
	STRH	r3, [r0, #30]		; y[2] = t[2]+t[5]
	STRH	r4, [r0, #46]		; y[3] = t[3]+t[4]
	STRH	r9, [r0, #62]		; y[4] = t[3]-t[4]
	STRH	r14,[r0, #78]		; y[5] = t[2]-t[5]
	STRH	r10,[r0, #94]		; y[6] = t[1]-t[6]
	STRH	r8, [r0, #110]		; y[7] = t[0]-t[7]
	LDMFD	r13!,{r1,PC}
	ENDP

 [ OC_ARM_ASM_MEDIA
	EXPORT	oc_idct8x8_1_v6
	EXPORT	oc_idct8x8_v6

oc_idct8x8_1_v6 PROC
	; r0 = ogg_int16_t  *_y
	; r1 = ogg_uint16_t  _dc
	ORR	r2, r1, r1, LSL #16
	ORR	r3, r1, r1, LSL #16
	STRD	r2, [r0], #8
	STRD	r2, [r0], #8
	STRD	r2, [r0], #8
	STRD	r2, [r0], #8
	STRD	r2, [r0], #8
	STRD	r2, [r0], #8
	STRD	r2, [r0], #8
	STRD	r2, [r0], #8
	STRD	r2, [r0], #8
	STRD	r2, [r0], #8
	STRD	r2, [r0], #8
	STRD	r2, [r0], #8
	STRD	r2, [r0], #8
	STRD	r2, [r0], #8
	STRD	r2, [r0], #8
	STRD	r2, [r0], #8
	MOV	PC, r14
	ENDP
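; Same DC fill as oc_idct8x8_1_arm, but ARMv6 STRD stores the 64-bit pair
;  r2:r3, so sixteen 8-byte stores cover the whole 128-byte block; in C:
;   for(i=0;i<64;i++)_y[i]=(ogg_int16_t)_dc;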

oc_idct8x8_v6 PROC
	; r0 = ogg_int16_t *_y
	; r1 = ogg_int16_t *_x
	; r2 = int          _last_zzi
	CMP	r2, #3
	BLE	oc_idct8x8_3_v6
	;CMP	r2, #6
	;BLE	oc_idct8x8_6_v6
	CMP	r2, #10
	BLE	oc_idct8x8_10_v6
oc_idct8x8_slow_v6
	STMFD	r13!,{r4-r11,r14}
	SUB	r13,r13,#64*2
; Row transforms
	STR	r0, [r13,#-4]!
	ADD	r0, r13, #4	; Write to temp storage.
	BL	idct8_8core_v6
	BL	idct8_8core_v6
	BL	idct8_8core_v6
	BL	idct8_8core_v6
	LDR	r0, [r13], #4	; Write to the final destination.
	; Clear input data for next block (decoder only).
	SUB	r2, r1, #8*16
	CMP	r0, r2
	MOV	r1, r13		; And read from temp storage.
	BEQ	oc_idct8x8_slow_v6_cols
	MOV	r4, #0
	MOV	r5, #0
	STRD	r4, [r2], #8
	STRD	r4, [r2], #8
	STRD	r4, [r2], #8
	STRD	r4, [r2], #8
	STRD	r4, [r2], #8
	STRD	r4, [r2], #8
	STRD	r4, [r2], #8
	STRD	r4, [r2], #8
	STRD	r4, [r2], #8
	STRD	r4, [r2], #8
	STRD	r4, [r2], #8
	STRD	r4, [r2], #8
	STRD	r4, [r2], #8
	STRD	r4, [r2], #8
	STRD	r4, [r2], #8
	STRD	r4, [r2], #8
oc_idct8x8_slow_v6_cols
; Column transforms
	BL	idct8_8core_down_v6
	BL	idct8_8core_down_v6
	BL	idct8_8core_down_v6
	BL	idct8_8core_down_v6
	ADD	r13,r13,#64*2
	LDMFD	r13!,{r4-r11,PC}
	ENDP

oc_idct8x8_10_v6 PROC
	STMFD	r13!,{r4-r11,r14}
	SUB	r13,r13,#64*2+4
; Row transforms
	MOV	r2, r13
	STR	r0, [r13,#-4]!
	AND	r0, r2, #4	; Align the stack.
	ADD	r0, r0, r2	; Write to temp storage.
	BL	idct4_3core_v6
	BL	idct2_1core_v6
	LDR	r0, [r13], #4	; Write to the final destination.
	; Clear input data for next block (decoder only).
	SUB	r2, r1, #4*16
	CMP	r0, r2
	AND	r1, r13,#4	; Align the stack.
	BEQ	oc_idct8x8_10_v6_cols
	MOV	r4, #0
	MOV	r5, #0
	STRD	r4, [r2]
	STRD	r4, [r2,#16]
	STR	r4, [r2,#32]
	STR	r4, [r2,#48]
oc_idct8x8_10_v6_cols
; Column transforms
	ADD	r1, r1, r13	; And read from temp storage.
	BL	idct4_4core_down_v6
	BL	idct4_4core_down_v6
	BL	idct4_4core_down_v6
	BL	idct4_4core_down_v6
	ADD	r13,r13,#64*2+4
	LDMFD	r13!,{r4-r11,PC}
	ENDP
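; STRD and LDRD require 8-byte-aligned addresses, so the temp buffer is not
;  simply placed at r13: adding sp&4 to sp rounds up to the next 8-byte
;  boundary, and the extra 4 bytes in the frame leave room to do so.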

oc_idct8x8_3_v6 PROC
	STMFD	r13!,{r4-r8,r14}
	SUB	r13,r13,#64*2
; Row transforms
	MOV	r8, r0
	MOV	r0, r13		; Write to temp storage.
	BL	idct2_1core_v6
	; Clear input data for next block (decoder only).
	SUB	r0, r1, #2*16
	CMP	r0, r8
	MOV	r1, r13		; Read from temp storage.
	MOVNE	r4, #0
	STRNE	r4, [r0]
	STRNE	r4, [r0,#16]
	MOVNE	r0, r8		; Write to the final destination.
; Column transforms
	BL	idct2_2core_down_v6
	BL	idct2_2core_down_v6
	BL	idct2_2core_down_v6
	BL	idct2_2core_down_v6
	ADD	r13,r13,#64*2
	LDMFD	r13!,{r4-r8,PC}
	ENDP

idct2_1core_v6 PROC
	; r0 =       ogg_int16_t *_y (destination)
	; r1 = const ogg_int16_t *_x (source)
; Stage 1:
	LDR	r2, [r1], #16		; r2 = <x[0,1]|x[0,0]>
	LDR	r3, OC_C4S4
	LDRSH	r6, [r1], #16		; r6 = x[1,0]
	SMULWB	r12,r3, r2		; r12= t[0,0]=OC_C4S4*x[0,0]>>16
	LDRD	r4, OC_C7S1		; r4 = OC_C7S1; r5 = OC_C1S7
	SMULWB	r6, r3, r6		; r6 = t[1,0]=OC_C4S4*x[1,0]>>16
	SMULWT	r4, r4, r2		; r4 = t[0,4]=OC_C7S1*x[0,1]>>16
	SMULWT	r7, r5, r2		; r7 = t[0,7]=OC_C1S7*x[0,1]>>16
; Stage 2:
	SMULWB	r5, r3, r4		; r5 = t[0,5]=OC_C4S4*t[0,4]>>16
	PKHBT	r12,r12,r6, LSL #16	; r12= <t[1,0]|t[0,0]>
	SMULWB	r6, r3, r7		; r6 = t[0,6]=OC_C4S4*t[0,7]>>16
	PKHBT	r7, r7, r3		; r7 = <0|t[0,7]>
; Stage 3:
	PKHBT	r5, r6, r5, LSL #16	; r5 = <t[0,5]|t[0,6]>
	PKHBT	r4, r4, r3		; r4 = <0|t[0,4]>
	SASX	r5, r5, r5		; r5 = <t[0,6]+t[0,5]|t[0,6]-t[0,5]>
; Stage 4:
	PKHTB	r6, r3, r5, ASR #16	; r6 = <0|t[0,6]>
	PKHBT	r5, r5, r3		; r5 = <0|t[0,5]>
	SADD16	r3, r12,r7		; r3 = t[0]+t[7]
	STR	r3, [r0], #4		; y[0<<3] = t[0]+t[7]
	SADD16	r3, r12,r6		; r3 = t[0]+t[6]
	STR	r3, [r0, #12]		; y[1<<3] = t[0]+t[6]
	SADD16	r3, r12,r5		; r3 = t[0]+t[5]
	STR	r3, [r0, #28]		; y[2<<3] = t[0]+t[5]
	SADD16	r3, r12,r4		; r3 = t[0]+t[4]
	STR	r3, [r0, #44]		; y[3<<3] = t[0]+t[4]
	SSUB16	r4, r12,r4		; r4 = t[0]-t[4]
	STR	r4, [r0, #60]		; y[4<<3] = t[0]-t[4]
	SSUB16	r5, r12,r5		; r5 = t[0]-t[5]
	STR	r5, [r0, #76]		; y[5<<3] = t[0]-t[5]
	SSUB16	r6, r12,r6		; r6 = t[0]-t[6]
	STR	r6, [r0, #92]		; y[6<<3] = t[0]-t[6]
	SSUB16	r7, r12,r7		; r7 = t[0]-t[7]
	STR	r7, [r0, #108]		; y[7<<3] = t[0]-t[7]
	MOV	PC,r14
	ENDP
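; The v6 cores use the ARMv6 media instructions to transform two rows at a
;  time: each 32-bit register packs a 16-bit value from one row in its
;  bottom half and the matching value from the next row in its top half
;  (assembled with PKHBT/PKHTB), SADD16/SSUB16 then do two butterflies per
;  instruction, and SMULWB/SMULWT perform the Q16 constant multiplies one
;  halfword at a time.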
 ]

	ALIGN 8
OC_C7S1
	DCD	12785 ; 31F1
OC_C1S7
	DCD	64277 ; FB15
OC_C6S2
	DCD	25080 ; 61F8
OC_C2S6
	DCD	60547 ; EC83
OC_C5S3
	DCD	36410 ; 8E3A
OC_C3S5
	DCD	54491 ; D4DB
OC_C4S4
	DCD	46341 ; B505
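; OC_CiSj==round(65536*cos(i*pi/16))==round(65536*sin(j*pi/16)) with j==8-i,
;  i.e. Q16 fixed-point rotation factors; e.g. OC_C4S4==46341~=65536/sqrt(2).
;  Every multiply in this file keeps only the top bits of the product.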

 [ OC_ARM_ASM_MEDIA
idct2_2core_down_v6 PROC
	; r0 =       ogg_int16_t *_y (destination)
	; r1 = const ogg_int16_t *_x (source)
; Stage 1:
	LDR	r2, [r1], #16		; r2 = <x[0,1]|x[0,0]>
	LDR	r3, OC_C4S4
	MOV	r7, #8			; r7 = 8
	LDR	r6, [r1], #16		; r6 = <x[1,1]|x[1,0]>
	SMLAWB	r12,r3, r2, r7		; r12= (t[0,0]=OC_C4S4*x[0,0]>>16)+8
	LDRD	r4, OC_C7S1		; r4 = OC_C7S1; r5 = OC_C1S7
	SMLAWB	r7, r3, r6, r7		; r7 = (t[1,0]=OC_C4S4*x[1,0]>>16)+8
	SMULWT	r5, r5, r2		; r5 = t[0,7]=OC_C1S7*x[0,1]>>16
	PKHBT	r12,r12,r7, LSL #16	; r12= <t[1,0]+8|t[0,0]+8>
	SMULWT	r4, r4, r2		; r4 = t[0,4]=OC_C7S1*x[0,1]>>16
; Here we cheat: row 1 had just a DC, so x[0,1]==x[1,1] by definition.
	PKHBT	r7, r5, r5, LSL #16	; r7 = <t[0,7]|t[0,7]>
; Stage 2:
	SMULWB	r6, r3, r7		; r6 = t[0,6]=OC_C4S4*t[0,7]>>16
	PKHBT	r4, r4, r4, LSL #16	; r4 = <t[0,4]|t[0,4]>
	SMULWT	r2, r3, r7		; r2 = t[1,6]=OC_C4S4*t[1,7]>>16
	SMULWB	r5, r3, r4		; r5 = t[0,5]=OC_C4S4*t[0,4]>>16
	PKHBT	r6, r6, r2, LSL #16	; r6 = <t[1,6]|t[0,6]>
	SMULWT	r2, r3, r4		; r2 = t[1,5]=OC_C4S4*t[1,4]>>16
	PKHBT	r2, r5, r2, LSL #16	; r2 = <t[1,5]|t[0,5]>
; Stage 3:
	SSUB16	r5, r6, r2		; r5 = <t[1,6]-t[1,5]|t[0,6]-t[0,5]>
	SADD16	r6, r6, r2		; r6 = <t[1,6]+t[1,5]|t[0,6]+t[0,5]>
; Stage 4:
	SADD16	r2, r12,r7		; r2 = t[0]+t[7]+8
	MOV	r3, r2, ASR #4
	MOV	r2, r2, LSL #16
	PKHTB	r3, r3, r2, ASR #20	; r3 = t[0]+t[7]+8>>4
	STR	r3, [r0], #4		; y[0<<3] = t[0]+t[7]+8>>4
	SADD16	r2, r12,r6		; r2 = t[0]+t[6]+8
	MOV	r3, r2, ASR #4
	MOV	r2, r2, LSL #16
	PKHTB	r3, r3, r2, ASR #20	; r3 = t[0]+t[6]+8>>4
	STR	r3, [r0, #12]		; y[1<<3] = t[0]+t[6]+8>>4
	SADD16	r2, r12,r5		; r2 = t[0]+t[5]+8
	MOV	r3, r2, ASR #4
	MOV	r2, r2, LSL #16
	PKHTB	r3, r3, r2, ASR #20	; r3 = t[0]+t[5]+8>>4
	STR	r3, [r0, #28]		; y[2<<3] = t[0]+t[5]+8>>4
	SADD16	r2, r12,r4		; r2 = t[0]+t[4]+8
	MOV	r3, r2, ASR #4
	MOV	r2, r2, LSL #16
	PKHTB	r3, r3, r2, ASR #20	; r3 = t[0]+t[4]+8>>4
	STR	r3, [r0, #44]		; y[3<<3] = t[0]+t[4]+8>>4
	SSUB16	r4, r12,r4		; r4 = t[0]-t[4]+8
	MOV	r3, r4, ASR #4
	MOV	r4, r4, LSL #16
	PKHTB	r3, r3, r4, ASR #20	; r3 = t[0]-t[4]+8>>4
	STR	r3, [r0, #60]		; y[4<<3] = t[0]-t[4]+8>>4
	SSUB16	r5, r12,r5		; r5 = t[0]-t[5]+8
	MOV	r3, r5, ASR #4
	MOV	r5, r5, LSL #16
	PKHTB	r3, r3, r5, ASR #20	; r3 = t[0]-t[5]+8>>4
	STR	r3, [r0, #76]		; y[5<<3] = t[0]-t[5]+8>>4
	SSUB16	r6, r12,r6		; r6 = t[0]-t[6]+8
	MOV	r3, r6, ASR #4
	MOV	r6, r6, LSL #16
	PKHTB	r3, r3, r6, ASR #20	; r3 = t[0]-t[6]+8>>4
	STR	r3, [r0, #92]		; y[6<<3] = t[0]-t[6]+8>>4
	SSUB16	r7, r12,r7		; r7 = t[0]-t[7]+8
	MOV	r3, r7, ASR #4
	MOV	r7, r7, LSL #16
	PKHTB	r3, r3, r7, ASR #20	; r3 = t[0]-t[7]+8>>4
	STR	r3, [r0, #108]		; y[7<<3] = t[0]-t[7]+8>>4
	MOV	PC,r14
	ENDP
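; The store sequence above applies (t+8)>>4 to both packed lanes at once
;  (the +8 was folded into t[0] earlier): ASR #4 positions the top lane,
;  LSL #16 then ASR #20 shifts the bottom lane by 4 with sign extension,
;  and PKHTB repacks the two 16-bit results into one 32-bit store.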

; In theory this should save ~75 cycles over oc_idct8x8_10, more than enough to
;  pay for increased branch mis-prediction to get here, but in practice it
;  doesn't seem to slow anything down to take it out, and it's less code this
;  way.
 [ 0
oc_idct8x8_6_v6 PROC
	STMFD	r13!,{r4-r8,r10,r11,r14}
	SUB	r13,r13,#64*2+4
; Row transforms
	MOV	r8, r0
	AND	r0, r13,#4	; Align the stack.
	ADD	r0, r0, r13	; Write to temp storage.
	BL	idct3_2core_v6
	BL	idct1core_v6
	; Clear input data for next block (decoder only).
	SUB	r0, r1, #3*16
	CMP	r0, r8
	AND	r1, r13,#4	; Align the stack.
	BEQ	oc_idct8x8_6_v6_cols
	MOV	r4, #0
	MOV	r5, #0
	STRD	r4, [r0]
	STR	r4, [r0,#16]
	STR	r4, [r0,#32]
	MOV	r0, r8		; Write to the final destination.
oc_idct8x8_6_v6_cols
; Column transforms
	ADD	r1, r1, r13	; And read from temp storage.
	BL	idct3_3core_down_v6
	BL	idct3_3core_down_v6
	BL	idct3_3core_down_v6
	BL	idct3_3core_down_v6
	ADD	r13,r13,#64*2+4
	LDMFD	r13!,{r4-r8,r10,r11,PC}
	ENDP

idct1core_v6 PROC
	; r0 =       ogg_int16_t *_y (destination)
	; r1 = const ogg_int16_t *_x (source)
	LDRSH	r3, [r1], #16
	MOV	r12,#0x05
	ORR	r12,r12,#0xB500
	MUL	r3, r12, r3
	; Stall ?
	MOV	r3, r3, ASR #16
	; Don't need to actually store the odd lines; they won't be read.
	STRH	r3, [r0], #2
	STRH	r3, [r0, #30]
	STRH	r3, [r0, #62]
	STRH	r3, [r0, #94]
	MOV	PC,R14
	ENDP

idct3_2core_v6 PROC
	; r0 =       ogg_int16_t *_y (destination)
	; r1 = const ogg_int16_t *_x (source)
; Stage 1:
	LDRD	r4, [r1], #16		; r4 = <x[0,1]|x[0,0]>; r5 = <*|x[0,2]>
	LDRD	r10,OC_C6S2_3_v6	; r10= OC_C6S2; r11= OC_C2S6
	; Stall
	SMULWB	r3, r11,r5		; r3 = t[0,3]=OC_C2S6*x[0,2]>>16
	LDR	r11,OC_C4S4
	SMULWB	r2, r10,r5		; r2 = t[0,2]=OC_C6S2*x[0,2]>>16
	LDR	r5, [r1], #16		; r5 = <x[1,1]|x[1,0]>
	SMULWB	r12,r11,r4		; r12= (t[0,0]=OC_C4S4*x[0,0]>>16)
	LDRD	r6, OC_C7S1_3_v6	; r6 = OC_C7S1; r7 = OC_C1S7
	SMULWB	r10,r11,r5		; r10= (t[1,0]=OC_C4S4*x[1,0]>>16)
	PKHBT	r12,r12,r10,LSL #16	; r12= <t[1,0]|t[0,0]>
	SMULWT	r10,r7, r5		; r10= t[1,7]=OC_C1S7*x[1,1]>>16
	PKHBT	r2, r2, r11		; r2 = <0|t[0,2]>
	SMULWT	r7, r7, r4		; r7 = t[0,7]=OC_C1S7*x[0,1]>>16
	PKHBT	r3, r3, r11		; r3 = <0|t[0,3]>
	SMULWT	r5, r6, r5		; r5 = t[1,4]=OC_C7S1*x[1,1]>>16
	PKHBT	r7, r7, r10,LSL #16	; r7 = <t[1,7]|t[0,7]>
	SMULWT	r4, r6, r4		; r4 = t[0,4]=OC_C7S1*x[0,1]>>16
; Stage 2:
	SMULWB	r6, r11,r7		; r6 = t[0,6]=OC_C4S4*t[0,7]>>16
	PKHBT	r4, r4, r5, LSL #16	; r4 = <t[1,4]|t[0,4]>
	SMULWT	r10,r11,r7		; r10= t[1,6]=OC_C4S4*t[1,7]>>16
	SMULWB	r5, r11,r4		; r5 = t[0,5]=OC_C4S4*t[0,4]>>16
	PKHBT	r6, r6, r10,LSL #16	; r6 = <t[1,6]|t[0,6]>
	SMULWT	r10,r11,r4		; r10= t[1,5]=OC_C4S4*t[1,4]>>16
; Stage 3:
	B	idct4_3core_stage3_v6
	ENDP

; Another copy so the LDRD offsets are less than +/- 255.
	ALIGN 8
OC_C7S1_3_v6
	DCD	12785 ; 31F1
OC_C1S7_3_v6
	DCD	64277 ; FB15
OC_C6S2_3_v6
	DCD	25080 ; 61F8
OC_C2S6_3_v6
	DCD	60547 ; EC83
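; LDRD loads two consecutive words into an even/odd register pair, but its
;  literal addressing mode only reaches +/-255 bytes, hence these duplicate
;  constant tables placed near their users.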

idct3_3core_down_v6 PROC
	; r0 =       ogg_int16_t *_y (destination)
	; r1 = const ogg_int16_t *_x (source)
; Stage 1:
	LDRD	r10,[r1], #16		; r10= <x[0,1]|x[0,0]>; r11= <??|x[0,2]>
	LDRD	r6, OC_C6S2_3_v6	; r6 = OC_C6S2; r7 = OC_C2S6
	LDR	r4, [r1], #16		; r4 = <x[1,1]|x[1,0]>
	SMULWB	r3, r7, r11		; r3 = t[0,3]=OC_C2S6*x[0,2]>>16
	MOV	r7,#8
	SMULWB	r2, r6, r11		; r2 = t[0,2]=OC_C6S2*x[0,2]>>16
	LDR	r11,OC_C4S4
	SMLAWB	r12,r11,r10,r7		; r12= t[0,0]+8=(OC_C4S4*x[0,0]>>16)+8
; Here we cheat: row 2 had just a DC, so x[0,2]==x[1,2] by definition.
	PKHBT	r3, r3, r3, LSL #16	; r3 = <t[0,3]|t[0,3]>
	SMLAWB	r5, r11,r4, r7		; r5 = t[1,0]+8=(OC_C4S4*x[1,0]>>16)+8
	PKHBT	r2, r2, r2, LSL #16	; r2 = <t[0,2]|t[0,2]>
	LDRD	r6, OC_C7S1_3_v6	; r6 = OC_C7S1; r7 = OC_C1S7
	PKHBT	r12,r12,r5, LSL #16	; r12= <t[1,0]+8|t[0,0]+8>
	SMULWT	r5, r7, r4		; r5 = t[1,7]=OC_C1S7*x[1,1]>>16
	SMULWT	r7, r7, r10		; r7 = t[0,7]=OC_C1S7*x[0,1]>>16
	SMULWT	r10,r6, r10		; r10= t[0,4]=OC_C7S1*x[0,1]>>16
	PKHBT	r7, r7, r5, LSL #16	; r7 = <t[1,7]|t[0,7]>
	SMULWT	r4, r6, r4		; r4 = t[1,4]=OC_C7S1*x[1,1]>>16
; Stage 2:
	SMULWB	r6, r11,r7		; r6 = t[0,6]=OC_C4S4*t[0,7]>>16
	PKHBT	r4, r10,r4, LSL #16	; r4 = <t[1,4]|t[0,4]>
	SMULWT	r10,r11,r7		; r10= t[1,6]=OC_C4S4*t[1,7]>>16
	SMULWB	r5, r11,r4		; r5 = t[0,5]=OC_C4S4*t[0,4]>>16
	PKHBT	r6, r6, r10,LSL #16	; r6 = <t[1,6]|t[0,6]>
	SMULWT	r10,r11,r4		; r10= t[1,5]=OC_C4S4*t[1,4]>>16
; Stage 3:
	B	idct4_4core_down_stage3_v6
	ENDP
 ]

idct4_3core_v6 PROC
	; r0 =       ogg_int16_t *_y (destination)
	; r1 = const ogg_int16_t *_x (source)
; Stage 1:
	LDRD	r10,[r1], #16	; r10= <x[0,1]|x[0,0]>; r11= <x[0,3]|x[0,2]>
	LDRD	r2, OC_C5S3_4_v6	; r2 = OC_C5S3; r3 = OC_C3S5
	LDRD	r4, [r1], #16		; r4 = <x[1,1]|x[1,0]>; r5 = <??|x[1,2]>
	SMULWT	r9, r3, r11		; r9 = t[0,6]=OC_C3S5*x[0,3]>>16
	SMULWT	r8, r2, r11		; r8 = -t[0,5]=OC_C5S3*x[0,3]>>16
	PKHBT	r9, r9, r2		; r9 = <0|t[0,6]>
	LDRD	r6, OC_C6S2_4_v6	; r6 = OC_C6S2; r7 = OC_C2S6
	PKHBT	r8, r8, r2		; r8 = <0|-t[0,5]>
	SMULWB	r3, r7, r11		; r3 = t[0,3]=OC_C2S6*x[0,2]>>16
	SMULWB	r2, r6, r11		; r2 = t[0,2]=OC_C6S2*x[0,2]>>16
	LDR	r11,OC_C4S4
	SMULWB	r12,r7, r5		; r12= t[1,3]=OC_C2S6*x[1,2]>>16
	SMULWB	r5, r6, r5		; r5 = t[1,2]=OC_C6S2*x[1,2]>>16
	PKHBT	r3, r3, r12,LSL #16	; r3 = <t[1,3]|t[0,3]>
	SMULWB	r12,r11,r10		; r12= t[0,0]=OC_C4S4*x[0,0]>>16
	PKHBT	r2, r2, r5, LSL #16	; r2 = <t[1,2]|t[0,2]>
	SMULWB	r5, r11,r4		; r5 = t[1,0]=OC_C4S4*x[1,0]>>16
	LDRD	r6, OC_C7S1_4_v6	; r6 = OC_C7S1; r7 = OC_C1S7
	PKHBT	r12,r12,r5, LSL #16	; r12= <t[1,0]|t[0,0]>
	SMULWT	r5, r7, r4		; r5 = t[1,7]=OC_C1S7*x[1,1]>>16
	SMULWT	r7, r7, r10		; r7 = t[0,7]=OC_C1S7*x[0,1]>>16
	SMULWT	r10,r6, r10		; r10= t[0,4]=OC_C7S1*x[0,1]>>16
	PKHBT	r7, r7, r5, LSL #16	; r7 = <t[1,7]|t[0,7]>
	SMULWT	r4, r6, r4		; r4 = t[1,4]=OC_C7S1*x[1,1]>>16
; Stage 2:
	SSUB16	r6, r7, r9		; r6 = t[7]-t[6]
	PKHBT	r4, r10,r4, LSL #16	; r4 = <t[1,4]|t[0,4]>
	SADD16	r7, r7, r9		; r7 = t[7]=t[7]+t[6]
	SMULWT	r9, r11,r6		; r9 = t[1,6]=OC_C4S4*r6T>>16
	SADD16	r5, r4, r8		; r5 = t[4]-t[5]
	SMULWB	r6, r11,r6		; r6 = t[0,6]=OC_C4S4*r6B>>16
	SSUB16	r4, r4, r8		; r4 = t[4]=t[4]+t[5]
	SMULWT	r10,r11,r5		; r10= t[1,5]=OC_C4S4*r5T>>16
	PKHBT	r6, r6, r9, LSL #16	; r6 = <t[1,6]|t[0,6]>
	SMULWB	r5, r11,r5		; r5 = t[0,5]=OC_C4S4*r5B>>16
; Stage 3:
idct4_3core_stage3_v6
	SADD16	r11,r12,r2		; r11= t[1]=t[0]+t[2]
	PKHBT	r10,r5, r10,LSL #16	; r10= <t[1,5]|t[0,5]>
	SSUB16	r2, r12,r2		; r2 = t[2]=t[0]-t[2]
idct4_3core_stage3_5_v6
	SSUB16	r5, r6, r10		; r5 = t[5]'=t[6]-t[5]
	SADD16	r6, r6, r10		; r6 = t[6]=t[6]+t[5]
	SADD16	r10,r12,r3		; r10= t[0]'=t[0]+t[3]
	SSUB16	r3, r12,r3		; r3 = t[3]=t[0]-t[3]
; Stage 4:
	SADD16	r12,r10,r7		; r12= t[0]+t[7]
	STR	r12,[r0], #4		; y[0<<3] = t[0]+t[7]
	SADD16	r12,r11,r6		; r12= t[1]+t[6]
	STR	r12,[r0, #12]		; y[1<<3] = t[1]+t[6]
	SADD16	r12,r2, r5		; r12= t[2]+t[5]
	STR	r12,[r0, #28]		; y[2<<3] = t[2]+t[5]
	SADD16	r12,r3, r4		; r12= t[3]+t[4]
	STR	r12,[r0, #44]		; y[3<<3] = t[3]+t[4]
	SSUB16	r4, r3, r4		; r4 = t[3]-t[4]
	STR	r4, [r0, #60]		; y[4<<3] = t[3]-t[4]
	SSUB16	r5, r2, r5		; r5 = t[2]-t[5]
	STR	r5, [r0, #76]		; y[5<<3] = t[2]-t[5]
	SSUB16	r6, r11,r6		; r6 = t[1]-t[6]
	STR	r6, [r0, #92]		; y[6<<3] = t[1]-t[6]
	SSUB16	r7, r10,r7		; r7 = t[0]-t[7]
	STR	r7, [r0, #108]		; y[7<<3] = t[0]-t[7]
	MOV	PC,r14
	ENDP

; Another copy so the LDRD offsets are less than +/- 255.
	ALIGN 8
OC_C7S1_4_v6
	DCD	12785 ; 31F1
OC_C1S7_4_v6
	DCD	64277 ; FB15
OC_C6S2_4_v6
	DCD	25080 ; 61F8
OC_C2S6_4_v6
	DCD	60547 ; EC83
OC_C5S3_4_v6
	DCD	36410 ; 8E3A
OC_C3S5_4_v6
	DCD	54491 ; D4DB

idct4_4core_down_v6 PROC
	; r0 =       ogg_int16_t *_y (destination)
	; r1 = const ogg_int16_t *_x (source)
; Stage 1:
	LDRD	r10,[r1], #16	; r10= <x[0,1]|x[0,0]>; r11= <x[0,3]|x[0,2]>
	LDRD	r2, OC_C5S3_4_v6	; r2 = OC_C5S3; r3 = OC_C3S5
	LDRD	r4, [r1], #16	; r4 = <x[1,1]|x[1,0]>; r5 = <x[1,3]|x[1,2]>
	SMULWT	r9, r3, r11		; r9 = t[0,6]=OC_C3S5*x[0,3]>>16
	LDRD	r6, OC_C6S2_4_v6	; r6 = OC_C6S2; r7 = OC_C2S6
	SMULWT	r8, r2, r11		; r8 = -t[0,5]=OC_C5S3*x[0,3]>>16
; Here we cheat: row 3 had just a DC, so x[0,3]==x[1,3] by definition.
	PKHBT	r9, r9, r9, LSL #16	; r9 = <t[0,6]|t[0,6]>
	SMULWB	r3, r7, r11		; r3 = t[0,3]=OC_C2S6*x[0,2]>>16
	PKHBT	r8, r8, r8, LSL #16	; r8 = <-t[0,5]|-t[0,5]>
	SMULWB	r2, r6, r11		; r2 = t[0,2]=OC_C6S2*x[0,2]>>16
	LDR	r11,OC_C4S4
	SMULWB	r12,r7, r5		; r12= t[1,3]=OC_C2S6*x[1,2]>>16
	MOV	r7,#8
	SMULWB	r5, r6, r5		; r5 = t[1,2]=OC_C6S2*x[1,2]>>16
	PKHBT	r3, r3, r12,LSL #16	; r3 = <t[1,3]|t[0,3]>
	SMLAWB	r12,r11,r10,r7		; r12= t[0,0]+8=(OC_C4S4*x[0,0]>>16)+8
	PKHBT	r2, r2, r5, LSL #16	; r2 = <t[1,2]|t[0,2]>
	SMLAWB	r5, r11,r4, r7		; r5 = t[1,0]+8=(OC_C4S4*x[1,0]>>16)+8
	LDRD	r6, OC_C7S1_4_v6	; r6 = OC_C7S1; r7 = OC_C1S7
	PKHBT	r12,r12,r5, LSL #16	; r12= <t[1,0]+8|t[0,0]+8>
	SMULWT	r5, r7, r4		; r5 = t[1,7]=OC_C1S7*x[1,1]>>16
	SMULWT	r7, r7, r10		; r7 = t[0,7]=OC_C1S7*x[0,1]>>16
	SMULWT	r10,r6, r10		; r10= t[0,4]=OC_C7S1*x[0,1]>>16
	PKHBT	r7, r7, r5, LSL #16	; r7 = <t[1,7]|t[0,7]>
	SMULWT	r4, r6, r4		; r4 = t[1,4]=OC_C7S1*x[1,1]>>16
; Stage 2:
	SSUB16	r6, r7, r9		; r6 = t[7]-t[6]
	PKHBT	r4, r10,r4, LSL #16	; r4 = <t[1,4]|t[0,4]>
	SADD16	r7, r7, r9		; r7 = t[7]=t[7]+t[6]
	SMULWT	r9, r11,r6		; r9 = t[1,6]=OC_C4S4*r6T>>16
	SADD16	r5, r4, r8		; r5 = t[4]-t[5]
	SMULWB	r6, r11,r6		; r6 = t[0,6]=OC_C4S4*r6B>>16
	SSUB16	r4, r4, r8		; r4 = t[4]=t[4]+t[5]
	SMULWT	r10,r11,r5		; r10= t[1,5]=OC_C4S4*r5T>>16
	PKHBT	r6, r6, r9, LSL #16	; r6 = <t[1,6]|t[0,6]>
	SMULWB	r5, r11,r5		; r5 = t[0,5]=OC_C4S4*r5B>>16
; Stage 3:
idct4_4core_down_stage3_v6
	SADD16	r11,r12,r2		; r11= t[1]+8=t[0]+t[2]+8
	PKHBT	r10,r5, r10,LSL #16	; r10= <t[1,5]|t[0,5]>
	SSUB16	r2, r12,r2		; r2 = t[2]+8=t[0]-t[2]+8
	B	idct8_8core_down_stage3_5_v6
	ENDP

idct8_8core_v6 PROC
	STMFD	r13!,{r0,r14}
; Stage 1:
	;5-6 rotation by 3pi/16
	LDRD	r10,OC_C5S3_4_v6	; r10= OC_C5S3, r11= OC_C3S5
	LDR	r4, [r1,#8]		; r4 = <x[0,5]|x[0,4]>
	LDR	r7, [r1,#24]		; r7 = <x[1,5]|x[1,4]>
	SMULWT	r5, r11,r4		; r5 = OC_C3S5*x[0,5]>>16
	LDR	r0, [r1,#4]		; r0 = <x[0,3]|x[0,2]>
	SMULWT	r3, r11,r7		; r3 = OC_C3S5*x[1,5]>>16
	LDR	r12,[r1,#20]		; r12= <x[1,3]|x[1,2]>
	SMULWT	r6, r11,r0		; r6 = OC_C3S5*x[0,3]>>16
	SMULWT	r11,r11,r12		; r11= OC_C3S5*x[1,3]>>16
	SMLAWT	r6, r10,r4, r6		; r6 = t[0,6]=r6+(OC_C5S3*x[0,5]>>16)
	PKHBT	r5, r5, r3, LSL #16	; r5 = <r3|r5>
	SMLAWT	r11,r10,r7, r11		; r11= t[1,6]=r11+(OC_C5S3*x[1,5]>>16)
	PKHBT	r4, r4, r7, LSL #16	; r4 = <x[1,4]|x[0,4]>
	SMULWT	r3, r10,r0		; r3 = OC_C5S3*x[0,3]>>16
	PKHBT	r6, r6, r11,LSL #16	; r6 = <t[1,6]|t[0,6]>
	SMULWT	r8, r10,r12		; r8 = OC_C5S3*x[1,3]>>16
	;2-3 rotation by 6pi/16
	LDRD	r10,OC_C6S2_4_v6	; r10= OC_C6S2, r11= OC_C2S6
	PKHBT	r3, r3, r8, LSL #16	; r3 = <r8|r3>
	LDR	r8, [r1,#12]		; r8 = <x[0,7]|x[0,6]>
	SMULWB	r2, r10,r0		; r2 = OC_C6S2*x[0,2]>>16
	SSUB16	r5, r5, r3		; r5 = <t[1,5]|t[0,5]>
	SMULWB	r9, r10,r12		; r9 = OC_C6S2*x[1,2]>>16
	LDR	r7, [r1,#28]		; r7 = <x[1,7]|x[1,6]>
	SMULWB	r3, r10,r8		; r3 = OC_C6S2*x[0,6]>>16
	SMULWB	r10,r10,r7		; r10= OC_C6S2*x[1,6]>>16
   1.1328 +	PKHBT	r2, r2, r9, LSL #16	; r2 = <r9|r2>
  1.1329 +	SMLAWB	r3, r11,r0, r3		; r3 = t[0,3]=r3+(OC_C2S6*x[0,2]>>16)
  1.1330 +	SMLAWB	r10,r11,r12,r10		; r10= t[1,3]=r10+(OC_C2S6*x[1,2]>>16)
  1.1331 +	SMULWB	r9, r11,r8		; r9 = OC_C2S6*x[0,6]>>16
   1.1332 +	PKHBT	r3, r3, r10,LSL #16	; r3 = <t[1,3]|t[0,3]>
  1.1333 +	SMULWB	r12,r11,r7		; r12= OC_C2S6*x[1,6]>>16
  1.1334 +	;4-7 rotation by 7pi/16
  1.1335 +	LDRD	r10,OC_C7S1_8_v6	; r10= OC_C7S1, r11= OC_C1S7
   1.1336 +	PKHBT	r9, r9, r12,LSL #16	; r9 = <r12|r9>
  1.1337 +	LDR	r0, [r1],#16		; r0 = <x[0,1]|x[0,0]>
  1.1338 +	PKHTB	r7, r7, r8, ASR #16	; r7 = <x[1,7]|x[0,7]>
  1.1339 +	SSUB16	r2, r2, r9		; r2 = <t[1,2]|t[0,2]>
  1.1340 +	SMULWB	r9, r10,r7		; r9 = OC_C7S1*x[0,7]>>16
  1.1341 +	LDR	r14,[r1],#16		; r14= <x[1,1]|x[1,0]>
  1.1342 +	SMULWT	r12,r10,r7		; r12= OC_C7S1*x[1,7]>>16
  1.1343 +	SMULWT	r8, r10,r0		; r8 = OC_C7S1*x[0,1]>>16
  1.1344 +	SMULWT	r10,r10,r14		; r10= OC_C7S1*x[1,1]>>16
  1.1345 +	SMLAWT	r9, r11,r0, r9		; r9 = t[0,7]=r9+(OC_C1S7*x[0,1]>>16)
   1.1346 +	PKHBT	r8, r8, r10,LSL #16	; r8 = <r10|r8>
  1.1347 +	SMLAWT	r12,r11,r14,r12		; r12= t[1,7]=r12+(OC_C1S7*x[1,1]>>16)
  1.1348 +	PKHBT	r0, r0, r14,LSL #16	; r0 = <x[1,0]|x[0,0]>
   1.1349 +	SMULWB	r10,r11,r7		; r10= OC_C1S7*x[0,7]>>16
  1.1350 +	PKHBT	r9, r9, r12,LSL #16	; r9 = <t[1,7]|t[0,7]>
   1.1351 +	SMULWT	r12,r11,r7		; r12= OC_C1S7*x[1,7]>>16
  1.1352 +	;0-1 butterfly
  1.1353 +	LDR	r11,OC_C4S4
  1.1354 +	PKHBT	r10,r10,r12,LSL #16	; r10= <r12|r10>
  1.1355 +	SADD16	r7, r0, r4		; r7 = x[0]+x[4]
  1.1356 +	SSUB16	r10,r8, r10		; r10= <t[1,4]|t[0,4]>
  1.1357 +	SSUB16	r4, r0, r4		; r4 = x[0]-x[4]
  1.1358 +	SMULWB	r8, r11,r7		; r8 = t[0,0]=OC_C4S4*r7B>>16
  1.1359 +	SMULWT	r12,r11,r7		; r12= t[1,0]=OC_C4S4*r7T>>16
  1.1360 +	SMULWB	r7, r11,r4		; r7 = t[0,1]=OC_C4S4*r4B>>16
  1.1361 +	PKHBT	r12,r8, r12,LSL #16	; r12= <t[1,0]|t[0,0]>
  1.1362 +	SMULWT	r8, r11,r4		; r8 = t[1,1]=OC_C4S4*r4T>>16
  1.1363 +; Stage 2:
  1.1364 +	SADD16	r4, r10,r5		; r4 = t[4]'=t[4]+t[5]
   1.1365 +	PKHBT	r8, r7, r8, LSL #16	; r8 = <t[1,1]|t[0,1]>
  1.1366 +	SSUB16	r5, r10,r5		; r5 = t[4]-t[5]
  1.1367 +	SMULWB	r10,r11,r5		; r10= t[0,5]=OC_C4S4*r5B>>16
  1.1368 +	SADD16	r7, r9, r6		; r7 = t[7]'=t[7]+t[6]
  1.1369 +	SMULWT	r5, r11,r5		; r5 = t[1,5]=OC_C4S4*r5T>>16
  1.1370 +	SSUB16	r6, r9, r6		; r6 = t[7]-t[6]
  1.1371 +	SMULWB	r9, r11,r6		; r9 = t[0,6]=OC_C4S4*r6B>>16
  1.1372 +	PKHBT	r10,r10,r5, LSL #16	; r10= <t[1,5]|t[0,5]>
  1.1373 +	SMULWT	r6, r11,r6		; r6 = t[1,6]=OC_C4S4*r6T>>16
  1.1374 +; Stage 3:
  1.1375 +	SADD16	r11,r8, r2		; r11= t[1]'=t[1]+t[2]
  1.1376 +	PKHBT	r6, r9, r6, LSL #16	; r6 = <t[1,6]|t[0,6]>
  1.1377 +	SSUB16	r2, r8, r2		; r2 = t[2]=t[1]-t[2]
  1.1378 +	LDMFD	r13!,{r0,r14}
  1.1379 +	B	idct4_3core_stage3_5_v6
  1.1380 +	ENDP
  1.1381 +
  1.1382 +; Another copy so the LDRD offsets are less than +/- 255.
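          +;  (The LDRD literal form only has an 8-bit immediate offset, so
          +;  each user of these constants needs a copy within that range.)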
  1.1383 +	ALIGN 8
  1.1384 +OC_C7S1_8_v6
  1.1385 +	DCD	12785 ; 31F1
  1.1386 +OC_C1S7_8_v6
  1.1387 +	DCD	64277 ; FB15
  1.1388 +OC_C6S2_8_v6
  1.1389 +	DCD	25080 ; 61F8
  1.1390 +OC_C2S6_8_v6
  1.1391 +	DCD	60547 ; EC83
  1.1392 +OC_C5S3_8_v6
  1.1393 +	DCD	36410 ; 8E3A
  1.1394 +OC_C3S5_8_v6
  1.1395 +	DCD	54491 ; D4DB
  1.1396 +
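          +; Same as idct8_8core_v6 above, but for the final (column) pass:
          +;  t[0] and t[1] are computed with a +8 bias (the SMLAW ops against
          +;  r14=8) so the store code below can produce y = (sum+8)>>4 with a
          +;  plain shift.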
  1.1397 +idct8_8core_down_v6 PROC
  1.1398 +	STMFD	r13!,{r0,r14}
  1.1399 +; Stage 1:
  1.1400 +	;5-6 rotation by 3pi/16
  1.1401 +	LDRD	r10,OC_C5S3_8_v6	; r10= OC_C5S3, r11= OC_C3S5
  1.1402 +	LDR	r4, [r1,#8]		; r4 = <x[0,5]|x[0,4]>
  1.1403 +	LDR	r7, [r1,#24]		; r7 = <x[1,5]|x[1,4]>
  1.1404 +	SMULWT	r5, r11,r4		; r5 = OC_C3S5*x[0,5]>>16
  1.1405 +	LDR	r0, [r1,#4]		; r0 = <x[0,3]|x[0,2]>
  1.1406 +	SMULWT	r3, r11,r7		; r3 = OC_C3S5*x[1,5]>>16
  1.1407 +	LDR	r12,[r1,#20]		; r12= <x[1,3]|x[1,2]>
  1.1408 +	SMULWT	r6, r11,r0		; r6 = OC_C3S5*x[0,3]>>16
  1.1409 +	SMULWT	r11,r11,r12		; r11= OC_C3S5*x[1,3]>>16
  1.1410 +	SMLAWT	r6, r10,r4, r6		; r6 = t[0,6]=r6+(OC_C5S3*x[0,5]>>16)
  1.1411 +	PKHBT	r5, r5, r3, LSL #16	; r5 = <r3|r5>
  1.1412 +	SMLAWT	r11,r10,r7, r11		; r11= t[1,6]=r11+(OC_C5S3*x[1,5]>>16)
  1.1413 +	PKHBT	r4, r4, r7, LSL #16	; r4 = <x[1,4]|x[0,4]>
  1.1414 +	SMULWT	r3, r10,r0		; r3 = OC_C5S3*x[0,3]>>16
  1.1415 +	PKHBT	r6, r6, r11,LSL #16	; r6 = <t[1,6]|t[0,6]>
  1.1416 +	SMULWT	r8, r10,r12		; r8 = OC_C5S3*x[1,3]>>16
  1.1417 +	;2-3 rotation by 6pi/16
  1.1418 +	LDRD	r10,OC_C6S2_8_v6	; r10= OC_C6S2, r11= OC_C2S6
  1.1419 +	PKHBT	r3, r3, r8, LSL #16	; r3 = <r8|r3>
  1.1420 +	LDR	r8, [r1,#12]		; r8 = <x[0,7]|x[0,6]>
  1.1421 +	SMULWB	r2, r10,r0		; r2 = OC_C6S2*x[0,2]>>16
  1.1422 +	SSUB16	r5, r5, r3		; r5 = <t[1,5]|t[0,5]>
  1.1423 +	SMULWB	r9, r10,r12		; r9 = OC_C6S2*x[1,2]>>16
  1.1424 +	LDR	r7, [r1,#28]		; r7 = <x[1,7]|x[1,6]>
  1.1425 +	SMULWB	r3, r10,r8		; r3 = OC_C6S2*x[0,6]>>16
  1.1426 +	SMULWB	r10,r10,r7		; r10= OC_C6S2*x[1,6]>>16
   1.1427 +	PKHBT	r2, r2, r9, LSL #16	; r2 = <r9|r2>
  1.1428 +	SMLAWB	r3, r11,r0, r3		; r3 = t[0,3]=r3+(OC_C2S6*x[0,2]>>16)
  1.1429 +	SMLAWB	r10,r11,r12,r10		; r10= t[1,3]=r10+(OC_C2S6*x[1,2]>>16)
  1.1430 +	SMULWB	r9, r11,r8		; r9 = OC_C2S6*x[0,6]>>16
   1.1431 +	PKHBT	r3, r3, r10,LSL #16	; r3 = <t[1,3]|t[0,3]>
  1.1432 +	SMULWB	r12,r11,r7		; r12= OC_C2S6*x[1,6]>>16
  1.1433 +	;4-7 rotation by 7pi/16
  1.1434 +	LDRD	r10,OC_C7S1_8_v6	; r10= OC_C7S1, r11= OC_C1S7
   1.1435 +	PKHBT	r9, r9, r12,LSL #16	; r9 = <r12|r9>
  1.1436 +	LDR	r0, [r1],#16		; r0 = <x[0,1]|x[0,0]>
  1.1437 +	PKHTB	r7, r7, r8, ASR #16	; r7 = <x[1,7]|x[0,7]>
  1.1438 +	SSUB16	r2, r2, r9		; r2 = <t[1,2]|t[0,2]>
  1.1439 +	SMULWB	r9, r10,r7		; r9 = OC_C7S1*x[0,7]>>16
  1.1440 +	LDR	r14,[r1],#16		; r14= <x[1,1]|x[1,0]>
  1.1441 +	SMULWT	r12,r10,r7		; r12= OC_C7S1*x[1,7]>>16
  1.1442 +	SMULWT	r8, r10,r0		; r8 = OC_C7S1*x[0,1]>>16
  1.1443 +	SMULWT	r10,r10,r14		; r10= OC_C7S1*x[1,1]>>16
  1.1444 +	SMLAWT	r9, r11,r0, r9		; r9 = t[0,7]=r9+(OC_C1S7*x[0,1]>>16)
   1.1445 +	PKHBT	r8, r8, r10,LSL #16	; r8 = <r10|r8>
  1.1446 +	SMLAWT	r12,r11,r14,r12		; r12= t[1,7]=r12+(OC_C1S7*x[1,1]>>16)
  1.1447 +	PKHBT	r0, r0, r14,LSL #16	; r0 = <x[1,0]|x[0,0]>
   1.1448 +	SMULWB	r10,r11,r7		; r10= OC_C1S7*x[0,7]>>16
  1.1449 +	PKHBT	r9, r9, r12,LSL #16	; r9 = <t[1,7]|t[0,7]>
   1.1450 +	SMULWT	r12,r11,r7		; r12= OC_C1S7*x[1,7]>>16
  1.1451 +	;0-1 butterfly
  1.1452 +	LDR	r11,OC_C4S4
  1.1453 +	MOV	r14,#8
  1.1454 +	PKHBT	r10,r10,r12,LSL #16	; r10= <r12|r10>
  1.1455 +	SADD16	r7, r0, r4		; r7 = x[0]+x[4]
  1.1456 +	SSUB16	r10,r8, r10		; r10= <t[1,4]|t[0,4]>
  1.1457 +	SMLAWB	r8, r11,r7, r14		; r8 = t[0,0]+8=(OC_C4S4*r7B>>16)+8
  1.1458 +	SSUB16	r4, r0, r4		; r4 = x[0]-x[4]
  1.1459 +	SMLAWT	r12,r11,r7, r14		; r12= t[1,0]+8=(OC_C4S4*r7T>>16)+8
  1.1460 +	SMLAWB	r7, r11,r4, r14		; r7 = t[0,1]+8=(OC_C4S4*r4B>>16)+8
  1.1461 +	PKHBT	r12,r8, r12,LSL #16	; r12= <t[1,0]+8|t[0,0]+8>
  1.1462 +	SMLAWT	r8, r11,r4, r14		; r8 = t[1,1]+8=(OC_C4S4*r4T>>16)+8
  1.1463 +; Stage 2:
  1.1464 +	SADD16	r4, r10,r5		; r4 = t[4]'=t[4]+t[5]
   1.1465 +	PKHBT	r8, r7, r8, LSL #16	; r8 = <t[1,1]+8|t[0,1]+8>
  1.1466 +	SSUB16	r5, r10,r5		; r5 = t[4]-t[5]
  1.1467 +	SMULWB	r10,r11,r5		; r10= t[0,5]=OC_C4S4*r5B>>16
  1.1468 +	SADD16	r7, r9, r6		; r7 = t[7]'=t[7]+t[6]
  1.1469 +	SMULWT	r5, r11,r5		; r5 = t[1,5]=OC_C4S4*r5T>>16
  1.1470 +	SSUB16	r6, r9, r6		; r6 = t[7]-t[6]
  1.1471 +	SMULWB	r9, r11,r6		; r9 = t[0,6]=OC_C4S4*r6B>>16
  1.1472 +	PKHBT	r10,r10,r5, LSL #16	; r10= <t[1,5]|t[0,5]>
  1.1473 +	SMULWT	r6, r11,r6		; r6 = t[1,6]=OC_C4S4*r6T>>16
  1.1474 +; Stage 3:
  1.1475 +	SADD16	r11,r8, r2		; r11= t[1]'+8=t[1]+t[2]+8
  1.1476 +	PKHBT	r6, r9, r6, LSL #16	; r6 = <t[1,6]|t[0,6]>
  1.1477 +	SSUB16	r2, r8, r2		; r2 = t[2]+8=t[1]-t[2]+8
  1.1478 +	LDMFD	r13!,{r0,r14}
  1.1479 +idct8_8core_down_stage3_5_v6
  1.1480 +	SSUB16	r5, r6, r10		; r5 = t[5]'=t[6]-t[5]
  1.1481 +	SADD16	r6, r6, r10		; r6 = t[6]=t[6]+t[5]
  1.1482 +	SADD16	r10,r12,r3		; r10= t[0]'+8=t[0]+t[3]+8
  1.1483 +	SSUB16	r3, r12,r3		; r3 = t[3]+8=t[0]-t[3]+8
  1.1484 +; Stage 4:
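          +; Each MOV/MOV/PKHTB triple below shifts the two packed halfwords
          +;  right by 4 independently: ASR #4 leaves the correct high result
          +;  in the top halfword, LSL #16 then ASR #20 re-derives the low
          +;  result with its own sign, and PKHTB merges the two.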
  1.1485 +	SADD16	r12,r10,r7		; r12= t[0]+t[7]+8
  1.1486 +	SSUB16	r7, r10,r7		; r7 = t[0]-t[7]+8
  1.1487 +	MOV	r10,r12,ASR #4
  1.1488 +	MOV	r12,r12,LSL #16
  1.1489 +	PKHTB	r10,r10,r12,ASR #20	; r10= t[0]+t[7]+8>>4
  1.1490 +	STR	r10,[r0], #4		; y[0<<3] = t[0]+t[7]+8>>4
  1.1491 +	SADD16	r12,r11,r6		; r12= t[1]+t[6]+8
  1.1492 +	SSUB16	r6, r11,r6		; r6 = t[1]-t[6]+8
  1.1493 +	MOV	r10,r12,ASR #4
  1.1494 +	MOV	r12,r12,LSL #16
  1.1495 +	PKHTB	r10,r10,r12,ASR #20	; r10= t[1]+t[6]+8>>4
  1.1496 +	STR	r10,[r0, #12]		; y[1<<3] = t[1]+t[6]+8>>4
  1.1497 +	SADD16	r12,r2, r5		; r12= t[2]+t[5]+8
  1.1498 +	SSUB16	r5, r2, r5		; r5 = t[2]-t[5]+8
  1.1499 +	MOV	r10,r12,ASR #4
  1.1500 +	MOV	r12,r12,LSL #16
  1.1501 +	PKHTB	r10,r10,r12,ASR #20	; r10= t[2]+t[5]+8>>4
  1.1502 +	STR	r10,[r0, #28]		; y[2<<3] = t[2]+t[5]+8>>4
  1.1503 +	SADD16	r12,r3, r4		; r12= t[3]+t[4]+8
  1.1504 +	SSUB16	r4, r3, r4		; r4 = t[3]-t[4]+8
  1.1505 +	MOV	r10,r12,ASR #4
  1.1506 +	MOV	r12,r12,LSL #16
  1.1507 +	PKHTB	r10,r10,r12,ASR #20	; r10= t[3]+t[4]+8>>4
  1.1508 +	STR	r10,[r0, #44]		; y[3<<3] = t[3]+t[4]+8>>4
  1.1509 +	MOV	r10,r4, ASR #4
  1.1510 +	MOV	r4, r4, LSL #16
  1.1511 +	PKHTB	r10,r10,r4, ASR #20	; r10= t[3]-t[4]+8>>4
  1.1512 +	STR	r10,[r0, #60]		; y[4<<3] = t[3]-t[4]+8>>4
  1.1513 +	MOV	r10,r5, ASR #4
  1.1514 +	MOV	r5, r5, LSL #16
  1.1515 +	PKHTB	r10,r10,r5, ASR #20	; r10= t[2]-t[5]+8>>4
  1.1516 +	STR	r10,[r0, #76]		; y[5<<3] = t[2]-t[5]+8>>4
  1.1517 +	MOV	r10,r6, ASR #4
  1.1518 +	MOV	r6, r6, LSL #16
  1.1519 +	PKHTB	r10,r10,r6, ASR #20	; r10= t[1]-t[6]+8>>4
  1.1520 +	STR	r10,[r0, #92]		; y[6<<3] = t[1]-t[6]+8>>4
  1.1521 +	MOV	r10,r7, ASR #4
  1.1522 +	MOV	r7, r7, LSL #16
  1.1523 +	PKHTB	r10,r10,r7, ASR #20	; r10= t[0]-t[7]+8>>4
  1.1524 +	STR	r10,[r0, #108]		; y[7<<3] = t[0]-t[7]+8>>4
  1.1525 +	MOV	PC,r14
  1.1526 +	ENDP
  1.1527 + ]
  1.1528 +
  1.1529 + [ OC_ARM_ASM_NEON
  1.1530 +	EXPORT	oc_idct8x8_1_neon
  1.1531 +	EXPORT	oc_idct8x8_neon
  1.1532 +
  1.1533 +	ALIGN 16
  1.1534 +OC_IDCT_CONSTS_NEON
  1.1535 +	DCW	    8
  1.1536 +	DCW	64277 ; FB15 (C1S7)
  1.1537 +	DCW	60547 ; EC83 (C2S6)
  1.1538 +	DCW	54491 ; D4DB (C3S5)
  1.1539 +	DCW	46341 ; B505 (C4S4)
   1.1540 +	DCW	36410 ; 8E3A (C5S3)
   1.1541 +	DCW	25080 ; 61F8 (C6S2)
  1.1542 +	DCW	12785 ; 31F1 (C7S1)
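          +; Loaded as {D0,D1} so VMULL.S16 can index the cosines by scalar:
          +;  D0[1]-D0[3] hold C1S7-C3S5 and D1[0]-D1[3] hold C4S4-C7S1, with
          +;  the constant 8 in D0[0]. As signed 16-bit lanes, the values of
          +;  0x8000 or more wrap negative, so VMULL actually computes
          +;  OC_CkS(8-k)*x-(x<<16); the code adds x back in after the VSHRN
          +;  wherever the true product is needed.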
  1.1543 +
  1.1544 +oc_idct8x8_1_neon PROC
  1.1545 +	; r0 = ogg_int16_t  *_y
  1.1546 +	; r1 = ogg_uint16_t  _dc
  1.1547 +	VDUP.S16	Q0, r1
  1.1548 +	VMOV		Q1, Q0
  1.1549 +	VST1.64		{D0, D1, D2, D3}, [r0@128]!
  1.1550 +	VST1.64		{D0, D1, D2, D3}, [r0@128]!
  1.1551 +	VST1.64		{D0, D1, D2, D3}, [r0@128]!
  1.1552 +	VST1.64		{D0, D1, D2, D3}, [r0@128]
  1.1553 +	MOV	PC, r14
  1.1554 +	ENDP
  1.1555 +
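          +; Full IDCT: a row pass over the pre-transposed input, an
          +;  in-register 8x8 transpose, then the column pass. If _y and _x
          +;  are distinct buffers, the input block is cleared on the way out
          +;  for the decoder's next block.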
  1.1556 +oc_idct8x8_neon PROC
  1.1557 +	; r0 = ogg_int16_t *_y
  1.1558 +	; r1 = ogg_int16_t *_x
  1.1559 +	; r2 = int          _last_zzi
  1.1560 +	CMP	r2, #10
  1.1561 +	BLE	oc_idct8x8_10_neon
  1.1562 +oc_idct8x8_slow_neon
  1.1563 +	VPUSH		{D8-D15}
  1.1564 +	MOV	r2, r1
  1.1565 +	ADR	r3, OC_IDCT_CONSTS_NEON
  1.1566 +	; Row transforms (input is pre-transposed)
  1.1567 +	VLD1.64		{D16,D17,D18,D19}, [r2@128]!
  1.1568 +	VLD1.64		{D20,D21,D22,D23}, [r2@128]!
  1.1569 +	VLD1.64		{D24,D25,D26,D27}, [r2@128]!
   1.1570 +	VSUB.S16	Q1, Q8, Q12	; Q1 = x[0]-x[4]
  1.1571 +	VLD1.64		{D28,D29,D30,D31}, [r2@128]
   1.1572 +	VADD.S16	Q8, Q8, Q12	; Q8 = x[0]+x[4]
  1.1573 +	VLD1.64		{D0,D1},           [r3@128]
  1.1574 +	MOV	r12, r14
  1.1575 +	BL	oc_idct8x8_stage123_neon
  1.1576 +; Stage 4
  1.1577 +	VSUB.S16	Q15,Q8, Q7	; Q15 = y[7]=t[0]'-t[7]'
  1.1578 +	VADD.S16	Q8, Q8, Q7	; Q8  = y[0]=t[0]'+t[7]'
  1.1579 +	VSUB.S16	Q14,Q9, Q3	; Q14 = y[6]=t[1]'-t[6]''
  1.1580 +	VADD.S16	Q9, Q9, Q3	; Q9  = y[1]=t[1]'+t[6]''
  1.1581 +	VSUB.S16	Q13,Q10,Q5	; Q13 = y[5]=t[2]'-t[5]''
  1.1582 +	VADD.S16	Q10,Q10,Q5	; Q10 = y[2]=t[2]'+t[5]''
  1.1583 +	VTRN.16		Q14,Q15
  1.1584 +	VSUB.S16	Q12,Q11,Q4	; Q12 = y[4]=t[3]'-t[4]'
  1.1585 +	VADD.S16	Q11,Q11,Q4	; Q11 = y[3]=t[3]'+t[4]'
  1.1586 +	; 8x8 Transpose
  1.1587 +	VTRN.16		Q8, Q9
  1.1588 +	VTRN.16		Q10,Q11
  1.1589 +	VTRN.16		Q12,Q13
  1.1590 +	VTRN.32		Q8, Q10
  1.1591 +	VTRN.32		Q9, Q11
  1.1592 +	VTRN.32		Q12,Q14
  1.1593 +	VTRN.32		Q13,Q15
  1.1594 +	VSWP		D17,D24
   1.1595 +	VSUB.S16	Q1, Q8, Q12	; Q1 = x[0]-x[4]
  1.1596 +	VSWP		D19,D26
   1.1597 +	VADD.S16	Q8, Q8, Q12	; Q8 = x[0]+x[4]
  1.1598 +	VSWP		D21,D28
  1.1599 +	VSWP		D23,D30
  1.1600 +	; Column transforms
  1.1601 +	BL	oc_idct8x8_stage123_neon
  1.1602 +	CMP	r0,r1
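          +; If _y and _x alias, there is no separate input block to clear.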
  1.1603 +	; We have to put the return address back in the LR, or the branch
  1.1604 +	;  predictor will not recognize the function return and mis-predict the
  1.1605 +	;  entire call stack.
  1.1606 +	MOV	r14, r12
  1.1607 +; Stage 4
  1.1608 +	VSUB.S16	Q15,Q8, Q7	; Q15 = y[7]=t[0]'-t[7]'
  1.1609 +	VADD.S16	Q8, Q8, Q7	; Q8  = y[0]=t[0]'+t[7]'
  1.1610 +	VSUB.S16	Q14,Q9, Q3	; Q14 = y[6]=t[1]'-t[6]''
  1.1611 +	VADD.S16	Q9, Q9, Q3	; Q9  = y[1]=t[1]'+t[6]''
  1.1612 +	VSUB.S16	Q13,Q10,Q5	; Q13 = y[5]=t[2]'-t[5]''
  1.1613 +	VADD.S16	Q10,Q10,Q5	; Q10 = y[2]=t[2]'+t[5]''
  1.1614 +	VSUB.S16	Q12,Q11,Q4	; Q12 = y[4]=t[3]'-t[4]'
  1.1615 +	VADD.S16	Q11,Q11,Q4	; Q11 = y[3]=t[3]'+t[4]'
  1.1616 +	BEQ		oc_idct8x8_slow_neon_noclear
  1.1617 +	VMOV.I8		Q2,#0
  1.1618 +	VPOP		{D8-D15}
  1.1619 +	VMOV.I8		Q3,#0
  1.1620 +	VRSHR.S16	Q8, Q8, #4	; Q8  = y[0]+8>>4
  1.1621 +	VST1.64		{D4, D5, D6, D7}, [r1@128]!
  1.1622 +	VRSHR.S16	Q9, Q9, #4	; Q9  = y[1]+8>>4
  1.1623 +	VRSHR.S16	Q10,Q10,#4	; Q10 = y[2]+8>>4
  1.1624 +	VST1.64		{D4, D5, D6, D7}, [r1@128]!
  1.1625 +	VRSHR.S16	Q11,Q11,#4	; Q11 = y[3]+8>>4
  1.1626 +	VRSHR.S16	Q12,Q12,#4	; Q12 = y[4]+8>>4
  1.1627 +	VST1.64		{D4, D5, D6, D7}, [r1@128]!
  1.1628 +	VRSHR.S16	Q13,Q13,#4	; Q13 = y[5]+8>>4
  1.1629 +	VRSHR.S16	Q14,Q14,#4	; Q14 = y[6]+8>>4
  1.1630 +	VST1.64		{D4, D5, D6, D7}, [r1@128]
  1.1631 +	VRSHR.S16	Q15,Q15,#4	; Q15 = y[7]+8>>4
  1.1632 +	VSTMIA		r0, {D16-D31}
  1.1633 +	MOV	PC, r14
  1.1634 +
  1.1635 +oc_idct8x8_slow_neon_noclear
  1.1636 +	VPOP		{D8-D15}
  1.1637 +	VRSHR.S16	Q8, Q8, #4	; Q8  = y[0]+8>>4
  1.1638 +	VRSHR.S16	Q9, Q9, #4	; Q9  = y[1]+8>>4
  1.1639 +	VRSHR.S16	Q10,Q10,#4	; Q10 = y[2]+8>>4
  1.1640 +	VRSHR.S16	Q11,Q11,#4	; Q11 = y[3]+8>>4
  1.1641 +	VRSHR.S16	Q12,Q12,#4	; Q12 = y[4]+8>>4
  1.1642 +	VRSHR.S16	Q13,Q13,#4	; Q13 = y[5]+8>>4
  1.1643 +	VRSHR.S16	Q14,Q14,#4	; Q14 = y[6]+8>>4
  1.1644 +	VRSHR.S16	Q15,Q15,#4	; Q15 = y[7]+8>>4
  1.1645 +	VSTMIA		r0, {D16-D31}
  1.1646 +	MOV	PC, r14
  1.1647 +	ENDP
  1.1648 +
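          +; Stages 1-3 of the transform, shared by the row and column passes.
          +;  Expects Q8 = x[0]+x[4] and Q1 = x[0]-x[4] precomputed by the
          +;  caller, the remaining coefficients in Q9-Q15, and the constants
          +;  in D0/D1.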
  1.1649 +oc_idct8x8_stage123_neon PROC
  1.1650 +; Stages 1 & 2
  1.1651 +	VMULL.S16	Q4, D18,D1[3]
  1.1652 +	VMULL.S16	Q5, D19,D1[3]
  1.1653 +	VMULL.S16	Q7, D30,D1[3]
  1.1654 +	VMULL.S16	Q6, D31,D1[3]
  1.1655 +	VMULL.S16	Q2, D30,D0[1]
  1.1656 +	VMULL.S16	Q3, D31,D0[1]
  1.1657 +	VSHRN.S32	D8, Q4, #16
  1.1658 +	VSHRN.S32	D9, Q5, #16	; Q4 = (OC_C7S1*x[1]>>16)
  1.1659 +	VSHRN.S32	D14,Q7, #16
  1.1660 +	VSHRN.S32	D15,Q6, #16	; Q7 = (OC_C7S1*x[7]>>16)
  1.1661 +	VSHRN.S32	D4, Q2, #16
  1.1662 +	VSHRN.S32	D5, Q3, #16	; Q2 = (OC_C1S7*x[7]>>16)-x[7]
  1.1663 +	VSUB.S16	Q4, Q4, Q15
  1.1664 +	VADD.S16	Q7, Q7, Q9
  1.1665 +	VSUB.S16	Q4, Q4, Q2	; Q4 = t[4]
  1.1666 +	VMULL.S16	Q2, D18,D0[1]
  1.1667 +	VMULL.S16	Q9, D19,D0[1]
  1.1668 +	VMULL.S16	Q5, D26,D0[3]
  1.1669 +	VMULL.S16	Q3, D27,D0[3]
  1.1670 +	VMULL.S16	Q6, D22,D0[3]
  1.1671 +	VMULL.S16	Q12,D23,D0[3]
  1.1672 +	VSHRN.S32	D4, Q2, #16
  1.1673 +	VSHRN.S32	D5, Q9, #16	; Q2 = (OC_C1S7*x[1]>>16)-x[1]
  1.1674 +	VSHRN.S32	D10,Q5, #16
  1.1675 +	VSHRN.S32	D11,Q3, #16	; Q5 = (OC_C3S5*x[5]>>16)-x[5]
  1.1676 +	VSHRN.S32	D12,Q6, #16
  1.1677 +	VSHRN.S32	D13,Q12,#16	; Q6 = (OC_C3S5*x[3]>>16)-x[3]
  1.1678 +	VADD.S16	Q7, Q7, Q2	; Q7 = t[7]
  1.1679 +	VSUB.S16	Q5, Q5, Q11
  1.1680 +	VADD.S16	Q6, Q6, Q11
  1.1681 +	VADD.S16	Q5, Q5, Q13
  1.1682 +	VADD.S16	Q6, Q6, Q13
  1.1683 +	VMULL.S16	Q9, D22,D1[1]
  1.1684 +	VMULL.S16	Q11,D23,D1[1]
  1.1685 +	VMULL.S16	Q15,D26,D1[1]
  1.1686 +	VMULL.S16	Q13,D27,D1[1]
  1.1687 +	VMULL.S16	Q2, D20,D1[2]
  1.1688 +	VMULL.S16	Q12,D21,D1[2]
  1.1689 +	VSHRN.S32	D18,Q9, #16
  1.1690 +	VSHRN.S32	D19,Q11,#16	; Q9 = (OC_C5S3*x[3]>>16)-x[3]
  1.1691 +	VSHRN.S32	D30,Q15,#16
  1.1692 +	VSHRN.S32	D31,Q13,#16	; Q15= (OC_C5S3*x[5]>>16)-x[5]
  1.1693 +	VSHRN.S32	D4, Q2, #16
  1.1694 +	VSHRN.S32	D5, Q12,#16	; Q2 = (OC_C6S2*x[2]>>16)
  1.1695 +	VSUB.S16	Q5, Q5, Q9	; Q5 = t[5]
  1.1696 +	VADD.S16	Q6, Q6, Q15	; Q6 = t[6]
  1.1697 +	VSUB.S16	Q2, Q2, Q14
  1.1698 +	VMULL.S16	Q3, D28,D1[2]
  1.1699 +	VMULL.S16	Q11,D29,D1[2]
  1.1700 +	VMULL.S16	Q12,D28,D0[2]
  1.1701 +	VMULL.S16	Q9, D29,D0[2]
  1.1702 +	VMULL.S16	Q13,D20,D0[2]
  1.1703 +	VMULL.S16	Q15,D21,D0[2]
  1.1704 +	VSHRN.S32	D6, Q3, #16
  1.1705 +	VSHRN.S32	D7, Q11,#16	; Q3 = (OC_C6S2*x[6]>>16)
  1.1706 +	VSHRN.S32	D24,Q12,#16
  1.1707 +	VSHRN.S32	D25,Q9, #16	; Q12= (OC_C2S6*x[6]>>16)-x[6]
  1.1708 +	VSHRN.S32	D26,Q13,#16
  1.1709 +	VSHRN.S32	D27,Q15,#16	; Q13= (OC_C2S6*x[2]>>16)-x[2]
  1.1710 +	VSUB.S16	Q9, Q4, Q5	; Q9 = t[4]-t[5]
  1.1711 +	VSUB.S16	Q11,Q7, Q6	; Q11= t[7]-t[6]
  1.1712 +	VADD.S16	Q3, Q3, Q10
  1.1713 +	VADD.S16	Q4, Q4, Q5	; Q4 = t[4]'=t[4]+t[5]
  1.1714 +	VADD.S16	Q7, Q7, Q6	; Q7 = t[7]'=t[7]+t[6]
  1.1715 +	VSUB.S16	Q2, Q2, Q12	; Q2 = t[2]
  1.1716 +	VADD.S16	Q3, Q3, Q13	; Q3 = t[3]
  1.1717 +	VMULL.S16	Q12,D16,D1[0]
  1.1718 +	VMULL.S16	Q13,D17,D1[0]
  1.1719 +	VMULL.S16	Q14,D2, D1[0]
  1.1720 +	VMULL.S16	Q15,D3, D1[0]
  1.1721 +	VMULL.S16	Q5, D18,D1[0]
  1.1722 +	VMULL.S16	Q6, D22,D1[0]
  1.1723 +	VSHRN.S32	D24,Q12,#16
  1.1724 +	VSHRN.S32	D25,Q13,#16
  1.1725 +	VSHRN.S32	D28,Q14,#16
  1.1726 +	VSHRN.S32	D29,Q15,#16
  1.1727 +	VMULL.S16	Q13,D19,D1[0]
  1.1728 +	VMULL.S16	Q15,D23,D1[0]
  1.1729 +	VADD.S16	Q8, Q8, Q12	; Q8 = t[0]
  1.1730 +	VADD.S16	Q1, Q1, Q14	; Q1 = t[1]
  1.1731 +	VSHRN.S32	D10,Q5, #16
  1.1732 +	VSHRN.S32	D12,Q6, #16
  1.1733 +	VSHRN.S32	D11,Q13,#16
  1.1734 +	VSHRN.S32	D13,Q15,#16
  1.1735 +	VADD.S16	Q5, Q5, Q9	; Q5 = t[5]'=OC_C4S4*(t[4]-t[5])>>16
  1.1736 +	VADD.S16	Q6, Q6, Q11	; Q6 = t[6]'=OC_C4S4*(t[7]-t[6])>>16
  1.1737 +; Stage 3
  1.1738 +	VSUB.S16	Q11,Q8, Q3	; Q11 = t[3]''=t[0]-t[3]
  1.1739 +	VADD.S16	Q8, Q8, Q3	; Q8  = t[0]''=t[0]+t[3]
  1.1740 +	VADD.S16	Q9, Q1, Q2	; Q9  = t[1]''=t[1]+t[2]
  1.1741 +	VADD.S16	Q3, Q6, Q5	; Q3  = t[6]''=t[6]'+t[5]'
  1.1742 +	VSUB.S16	Q10,Q1, Q2	; Q10 = t[2]''=t[1]-t[2]
  1.1743 +	VSUB.S16	Q5, Q6, Q5	; Q5  = t[5]''=t[6]'-t[5]'
  1.1744 +	MOV	PC, r14
  1.1745 +	ENDP
  1.1746 +
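          +; Fast path for _last_zzi <= 10: the first ten coefficients in
          +;  zig-zag order all lie in the upper-left 4x4 of the block, so
          +;  only four 4-coefficient row transforms and a 4-coefficient
          +;  column pass are needed.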
  1.1747 +oc_idct8x8_10_neon PROC
  1.1748 +	ADR	r3, OC_IDCT_CONSTS_NEON
  1.1749 +	VLD1.64		{D0,D1},          [r3@128]
  1.1750 +	MOV	r2, r1
  1.1751 +	; Row transforms (input is pre-transposed)
  1.1752 +; Stage 1
  1.1753 +	VLD1.64		{D16,D17,D18,D19},[r2@128]!
  1.1754 +	MOV	r12, #16
  1.1755 +	VMULL.S16	Q15,D16,D1[0]	; Q15= OC_C4S4*x[0]-(x[0]<<16)
  1.1756 +	VLD1.64		{D17},            [r2@64], r12
  1.1757 +	VMULL.S16	Q2, D18,D0[1]	; Q2 = OC_C1S7*x[1]-(x[1]<<16)
  1.1758 +	VLD1.64		{D19},            [r2@64]
  1.1759 +	VMULL.S16	Q14,D17,D0[2]	; Q14= OC_C2S6*x[2]-(x[2]<<16)
  1.1760 +	VMULL.S16	Q3, D19,D0[3]	; Q3 = OC_C3S5*x[3]-(x[3]<<16)
  1.1761 +	VMULL.S16	Q13,D19,D1[1]	; Q13= OC_C5S3*x[3]-(x[3]<<16)
  1.1762 +	VMULL.S16	Q12,D18,D1[3]	; Q12= OC_C7S1*x[1]
  1.1763 +	VMULL.S16	Q1, D17,D1[2]	; Q1 = OC_C6S2*x[2]
  1.1764 +	VSHRN.S32	D30,Q15,#16	; D30= t[0]-x[0]
  1.1765 +	VSHRN.S32	D4, Q2, #16	; D4 = t[7]-x[1]
  1.1766 +	VSHRN.S32	D31,Q14,#16	; D31= t[3]-x[2]
  1.1767 +	VSHRN.S32	D6, Q3, #16	; D6 = t[6]-x[3]
  1.1768 +	VSHRN.S32	D7, Q13,#16	; D7 = -t[5]-x[3]
  1.1769 +	VSHRN.S32	D5, Q12,#16	; D5 = t[4]
  1.1770 +	VSHRN.S32	D2, Q1, #16	; D2 = t[2]
  1.1771 +	VADD.S16	D4, D4, D18	; D4 = t[7]
  1.1772 +	VADD.S16	D6, D6, D19	; D6 = t[6]
  1.1773 +	VADD.S16	D7, D7, D19	; D7 = -t[5]
  1.1774 +	VADD.S16	Q15,Q15,Q8	; D30= t[0]
  1.1775 +					; D31= t[3]
  1.1776 +; Stages 2 & 3
  1.1777 +	VSUB.S16	Q12,Q2, Q3	; D24= t[7]-t[6]
  1.1778 +					; D25= t[4]'=t[4]+t[5]
  1.1779 +	VADD.S16	Q13,Q2, Q3	; D26= t[7]'=t[7]+t[6]
  1.1780 +					; D27= t[4]-t[5]
  1.1781 +	VMULL.S16	Q11,D24,D1[0]	; Q11= OC_C4S4*(t[7]-t[6])
  1.1782 +					;       -(t[7]-t[6]<<16)
  1.1783 +	VMULL.S16	Q14,D27,D1[0]	; Q14= OC_C4S4*(t[4]-t[5])
  1.1784 +					;       -(t[4]-t[5]<<16)
  1.1785 +	VADD.S16	D16,D30,D31	; D16= t[0]'=t[0]+t[3]
  1.1786 +	VSUB.S16	D17,D30,D2	; D17= t[2]'=t[0]-t[2]
  1.1787 +	VADD.S16	D18,D30,D2	; D18= t[1]'=t[0]+t[2]
  1.1788 +	VSHRN.S32	D22,Q11,#16	; D22= (OC_C4S4*(t[7]-t[6])>>16)
  1.1789 +					;       -(t[7]-t[6])
  1.1790 +	VSHRN.S32	D23,Q14,#16	; D23= (OC_C4S4*(t[4]-t[5])>>16)
  1.1791 +					;       -(t[4]-t[5])
  1.1792 +	VSUB.S16	D19,D30,D31	; D19= t[3]'=t[0]-t[3]
  1.1793 +	VADD.S16	D22,D22,D24	; D22= t[6]'=OC_C4S4*(t[7]-t[6])>>16
  1.1794 +	VADD.S16	D23,D23,D27	; D23= t[5]'=OC_C4S4*(t[4]-t[5])>>16
  1.1795 +	VSUB.S16	D27,D22,D23	; D27= t[5]''=t[6]'-t[5]'
  1.1796 +	VADD.S16	D24,D22,D23	; D24= t[6]''=t[6]'+t[5]'
  1.1797 +; Stage 4
  1.1798 +	VSUB.S16	Q11,Q8, Q13	; D22= y[7]=t[0]'-t[7]'
  1.1799 +					; D23= y[5]=t[2]'-t[5]''
   1.1800 +	VSUB.S16	Q10,Q9, Q12	; D20= y[6]=t[1]'-t[6]''
   1.1801 +					; D21= y[4]=t[3]'-t[4]'
  1.1802 +	VADD.S16	Q8, Q8, Q13	; D16= y[0]=t[0]'+t[7]'
  1.1803 +					; D17= y[2]=t[2]'+t[5]''
   1.1804 +	VADD.S16	Q9, Q9, Q12	; D18= y[1]=t[1]'+t[6]''
   1.1805 +					; D19= y[3]=t[3]'+t[4]'
  1.1806 +	; 8x4 transpose
  1.1807 +	VTRN.16		Q10,Q11		; Q10= c5c4a5a4 c7c6a7a6
  1.1808 +					; Q11= d5d4b5b4 d7d6b7b6
  1.1809 +	VTRN.16		Q8, Q9		; Q8 = c3c2a3a2 c1c0a1a0
  1.1810 +					; Q9 = d3d2b3b2 d1d0b1b0
  1.1811 +	VSWP		D20,D21		; Q10= c7c6a7a6 c5c4a5a4
  1.1812 +	VSWP		D22,D23		; Q11= d7d6b7b6 d5d4b5b4
  1.1813 +	VUZP.32		Q9, Q11		; Q9 = b7b6b5b4 b3b2b1b0
  1.1814 +					; Q11= d7d6d5d4 d3d2d1d0
  1.1815 +	VMULL.S16	Q15,D18,D0[1]
  1.1816 +	VMULL.S16	Q13,D22,D1[1]
  1.1817 +	VUZP.32		Q8, Q10		; Q8 = a7a6a5a4 a3a2a1a0
  1.1818 +					; Q10= c7c6c5c4 c3c2c1c0
  1.1819 +	; Column transforms
  1.1820 +; Stages 1, 2, & 3
  1.1821 +	VMULL.S16	Q14,D19,D0[1]	; Q14:Q15= OC_C1S7*x[1]-(x[1]<<16)
  1.1822 +	VMULL.S16	Q12,D23,D1[1]	; Q12:Q13= OC_C5S3*x[3]-(x[3]<<16)
  1.1823 +	VMULL.S16	Q3, D22,D0[3]
  1.1824 +	VMULL.S16	Q2, D23,D0[3]	;  Q2:Q3 = OC_C3S5*x[3]-(x[3]<<16)
  1.1825 +	VSHRN.S32	D30,Q15,#16
  1.1826 +	VSHRN.S32	D31,Q14,#16	; Q15= (OC_C1S7*x[1]>>16)-x[1]
  1.1827 +	VSHRN.S32	D26,Q13,#16
  1.1828 +	VSHRN.S32	D27,Q12,#16	; Q13= (OC_C5S3*x[3]>>16)-x[3]
  1.1829 +	VSHRN.S32	D28,Q3, #16
  1.1830 +	VSHRN.S32	D29,Q2, #16	; Q14= (OC_C3S5*x[3]>>16)-x[3]
  1.1831 +	VADD.S16	Q15,Q15,Q9	; Q15= t[7]
  1.1832 +	VADD.S16	Q13,Q13,Q11	; Q13= -t[5]
  1.1833 +	VADD.S16	Q14,Q14,Q11	; Q14= t[6]
  1.1834 +	VMULL.S16	Q12,D18,D1[3]
  1.1835 +	VMULL.S16	Q2, D19,D1[3]	;  Q2:Q12= OC_C7S1*x[1]
  1.1836 +	VMULL.S16	Q1, D16,D1[0]
  1.1837 +	VMULL.S16	Q11,D17,D1[0]	; Q11:Q1 = OC_C4S4*x[0]-(x[0]<<16)
  1.1838 +	VMULL.S16	Q3, D20,D0[2]
  1.1839 +	VMULL.S16	Q9, D21,D0[2]	;  Q9:Q3 = OC_C2S6*x[2]-(x[2]<<16)
  1.1840 +	VSHRN.S32	D24,Q12,#16
  1.1841 +	VSHRN.S32	D25,Q2, #16	; Q12= t[4]
  1.1842 +	VMULL.S16	Q2, D20,D1[2]
  1.1843 +	VSHRN.S32	D2, Q1, #16
  1.1844 +	VSHRN.S32	D3, Q11,#16	; Q1 = (OC_C4S4*x[0]>>16)-x[0]
  1.1845 +	VMULL.S16	Q11,D21,D1[2]	;  Q2:Q11= OC_C6S2*x[2]
  1.1846 +	VSHRN.S32	D6, Q3, #16
  1.1847 +	VSHRN.S32	D7, Q9, #16	; Q3 = (OC_C2S6*x[2]>>16)-x[2]
  1.1848 +	VSUB.S16	Q9, Q15,Q14	; Q9 = t[7]-t[6]
  1.1849 +	VADD.S16	Q15,Q15,Q14	; Q15= t[7]'=t[7]+t[6]
  1.1850 +	VSHRN.S32	D4, Q2, #16
  1.1851 +	VSHRN.S32	D5, Q11,#16	; Q2 = t[2]
  1.1852 +	VADD.S16	Q1, Q1, Q8	; Q1 = t[0]
  1.1853 +	VADD.S16	Q8, Q12,Q13	; Q8 = t[4]-t[5]
  1.1854 +	VADD.S16	Q3, Q3, Q10	; Q3 = t[3]
  1.1855 +	VMULL.S16	Q10,D16,D1[0]
  1.1856 +	VMULL.S16	Q11,D17,D1[0]	; Q11:Q10= OC_C4S4*(t[4]-t[5])
  1.1857 +					;           -(t[4]-t[5]<<16)
  1.1858 +	VSUB.S16	Q12,Q12,Q13	; Q12= t[4]'=t[4]+t[5]
  1.1859 +	VMULL.S16	Q14,D18,D1[0]
   1.1860 +	VMULL.S16	Q13,D19,D1[0]	; Q13:Q14= OC_C4S4*(t[7]-t[6])
   1.1861 +					;           -(t[7]-t[6]<<16)
  1.1862 +	VSHRN.S32	D20,Q10,#16
  1.1863 +	VSHRN.S32	D21,Q11,#16	; Q10= (OC_C4S4*(t[4]-t[5])>>16)
  1.1864 +					;       -(t[4]-t[5])
  1.1865 +	VADD.S16	Q11,Q1, Q3	; Q11= t[0]'=t[0]+t[3]
  1.1866 +	VSUB.S16	Q3, Q1, Q3	; Q3 = t[3]'=t[0]-t[3]
  1.1867 +	VSHRN.S32	D28,Q14,#16
  1.1868 +	VSHRN.S32	D29,Q13,#16	; Q14= (OC_C4S4*(t[7]-t[6])>>16)
  1.1869 +					;       -(t[7]-t[6])
  1.1870 +	VADD.S16	Q10,Q10,Q8	; Q10=t[5]'
  1.1871 +	VADD.S16	Q14,Q14,Q9	; Q14=t[6]'
  1.1872 +	VSUB.S16	Q13,Q14,Q10	; Q13=t[5]''=t[6]'-t[5]'
  1.1873 +	VADD.S16	Q14,Q14,Q10	; Q14=t[6]''=t[6]'+t[5]'
  1.1874 +	VADD.S16	Q10,Q1, Q2	; Q10= t[1]'=t[0]+t[2]
  1.1875 +	VSUB.S16	Q2, Q1, Q2	; Q2 = t[2]'=t[0]-t[2]
  1.1876 +; Stage 4
  1.1877 +	CMP	r0, r1
  1.1878 +	VADD.S16	Q8, Q11,Q15	; Q8  = y[0]=t[0]'+t[7]'
  1.1879 +	VADD.S16	Q9, Q10,Q14	; Q9  = y[1]=t[1]'+t[6]''
  1.1880 +	VSUB.S16	Q15,Q11,Q15	; Q15 = y[7]=t[0]'-t[7]'
  1.1881 +	VSUB.S16	Q14,Q10,Q14	; Q14 = y[6]=t[1]'-t[6]''
  1.1882 +	VADD.S16	Q10,Q2, Q13	; Q10 = y[2]=t[2]'+t[5]''
  1.1883 +	VADD.S16	Q11,Q3, Q12	; Q11 = y[3]=t[3]'+t[4]'
  1.1884 +	VSUB.S16	Q12,Q3, Q12	; Q12 = y[4]=t[3]'-t[4]'
  1.1885 +	VSUB.S16	Q13,Q2, Q13	; Q13 = y[5]=t[2]'-t[5]''
  1.1886 +	BEQ	oc_idct8x8_10_neon_noclear
  1.1887 +	VMOV.I8		D2, #0
  1.1888 +	VRSHR.S16	Q8, Q8, #4	; Q8  = y[0]+8>>4
  1.1889 +	VST1.64		{D2}, [r1@64], r12
  1.1890 +	VRSHR.S16	Q9, Q9, #4	; Q9  = y[1]+8>>4
  1.1891 +	VRSHR.S16	Q10,Q10,#4	; Q10 = y[2]+8>>4
  1.1892 +	VST1.64		{D2}, [r1@64], r12
  1.1893 +	VRSHR.S16	Q11,Q11,#4	; Q11 = y[3]+8>>4
  1.1894 +	VRSHR.S16	Q12,Q12,#4	; Q12 = y[4]+8>>4
  1.1895 +	VST1.64		{D2}, [r1@64], r12
  1.1896 +	VRSHR.S16	Q13,Q13,#4	; Q13 = y[5]+8>>4
  1.1897 +	VRSHR.S16	Q14,Q14,#4	; Q14 = y[6]+8>>4
  1.1898 +	VST1.64		{D2}, [r1@64]
  1.1899 +	VRSHR.S16	Q15,Q15,#4	; Q15 = y[7]+8>>4
  1.1900 +	VSTMIA		r0, {D16-D31}
  1.1901 +	MOV	PC, r14
  1.1902 +
  1.1903 +oc_idct8x8_10_neon_noclear
  1.1904 +	VRSHR.S16	Q8, Q8, #4	; Q8  = y[0]+8>>4
  1.1905 +	VRSHR.S16	Q9, Q9, #4	; Q9  = y[1]+8>>4
  1.1906 +	VRSHR.S16	Q10,Q10,#4	; Q10 = y[2]+8>>4
  1.1907 +	VRSHR.S16	Q11,Q11,#4	; Q11 = y[3]+8>>4
  1.1908 +	VRSHR.S16	Q12,Q12,#4	; Q12 = y[4]+8>>4
  1.1909 +	VRSHR.S16	Q13,Q13,#4	; Q13 = y[5]+8>>4
  1.1910 +	VRSHR.S16	Q14,Q14,#4	; Q14 = y[6]+8>>4
  1.1911 +	VRSHR.S16	Q15,Q15,#4	; Q15 = y[7]+8>>4
  1.1912 +	VSTMIA		r0, {D16-D31}
  1.1913 +	MOV	PC, r14
  1.1914 +	ENDP
  1.1915 + ]
  1.1916 +
  1.1917 +	END
