media/libtheora/lib/arm/armidct.s

author:      Michael Schloh von Bennewitz <michael@schloh.com>
date:        Thu, 22 Jan 2015 13:21:57 +0100
branch:      TOR_BUG_9701
changeset:   15:b8a032363ba2
permissions: -rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

     1 ;********************************************************************
     2 ;*                                                                  *
     3 ;* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
     4 ;* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
     5 ;* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
     6 ;* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
     7 ;*                                                                  *
     8 ;* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
     9 ;* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
    10 ;*                                                                  *
    11 ;********************************************************************
    12 ; Original implementation:
    13 ;  Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd
    14 ; last mod: $Id: armidct.s 17481 2010-10-03 22:49:42Z tterribe $
    15 ;********************************************************************
    17 	AREA	|.text|, CODE, READONLY
    19 	; Explicitly specifying alignment here because some versions of
    20 	; gas don't align code correctly. See
    21 	; http://lists.gnu.org/archive/html/bug-binutils/2011-06/msg00199.html
    22 	; https://bugzilla.mozilla.org/show_bug.cgi?id=920992
    23 	ALIGN
    25 	GET	armopts.s
    27 	EXPORT	oc_idct8x8_1_arm
    28 	EXPORT	oc_idct8x8_arm
    30 oc_idct8x8_1_arm PROC
    31 	; r0 = ogg_int16_t  *_y
    32 	; r1 = ogg_uint16_t  _dc
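	; DC-only block: the 16-bit _dc value is packed into both halves of a
	;  word and broadcast to four registers, so each STMIA below fills 8 of
	;  the 64 output coefficients (the value is stored unchanged; any
	;  scaling is presumably applied by the caller).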
    33 	ORR	r1, r1, r1, LSL #16
    34 	MOV	r2, r1
    35 	MOV	r3, r1
    36 	MOV	r12,r1
    37 	STMIA	r0!,{r1,r2,r3,r12}
    38 	STMIA	r0!,{r1,r2,r3,r12}
    39 	STMIA	r0!,{r1,r2,r3,r12}
    40 	STMIA	r0!,{r1,r2,r3,r12}
    41 	STMIA	r0!,{r1,r2,r3,r12}
    42 	STMIA	r0!,{r1,r2,r3,r12}
    43 	STMIA	r0!,{r1,r2,r3,r12}
    44 	STMIA	r0!,{r1,r2,r3,r12}
    45 	MOV	PC, r14
    46 	ENDP
    48 oc_idct8x8_arm PROC
    49 	; r0 = ogg_int16_t *_y
    50 	; r1 = ogg_int16_t *_x
    51 	; r2 = int          _last_zzi
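	; Dispatch on _last_zzi, which bounds how many coefficients (in zig-zag
	;  order) can be nonzero: blocks with at most 3, 6, or 10 potentially
	;  nonzero coefficients take the cut-down transforms below; everything
	;  else falls through to the full transform.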
    52 	CMP	r2, #3
    53 	BLE	oc_idct8x8_3_arm
    54 	CMP	r2, #6
    55 	BLE	oc_idct8x8_6_arm
    56 	CMP	r2, #10
    57 	BLE	oc_idct8x8_10_arm
    58 oc_idct8x8_slow_arm
    59 	STMFD	r13!,{r4-r11,r14}
    60 	SUB	r13,r13,#64*2
    61 ; Row transforms
    62 	STR	r0, [r13,#-4]!
    63 	ADD	r0, r13, #4	; Write to temp storage.
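	; Two-pass 2-D IDCT: eight 1-D row transforms write into the 64*2-byte
	;  temp buffer on the stack, then eight column transforms (the _down
	;  variants, which also add the rounding bias and shift down by 4)
	;  produce the final output.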
    64 	BL	idct8core_arm
    65 	BL	idct8core_arm
    66 	BL	idct8core_arm
    67 	BL	idct8core_arm
    68 	BL	idct8core_arm
    69 	BL	idct8core_arm
    70 	BL	idct8core_arm
    71 	BL	idct8core_arm
    72 	LDR	r0, [r13], #4	; Write to the final destination.
    73 	; Clear input data for next block (decoder only).
    74 	SUB	r2, r1, #8*16
    75 	CMP	r0, r2
    76 	MOV	r1, r13		; And read from temp storage.
    77 	BEQ	oc_idct8x8_slow_arm_cols
    78 	MOV	r4, #0
    79 	MOV	r5, #0
    80 	MOV	r6, #0
    81 	MOV	r7, #0
    82 	STMIA	r2!,{r4,r5,r6,r7}
    83 	STMIA	r2!,{r4,r5,r6,r7}
    84 	STMIA	r2!,{r4,r5,r6,r7}
    85 	STMIA	r2!,{r4,r5,r6,r7}
    86 	STMIA	r2!,{r4,r5,r6,r7}
    87 	STMIA	r2!,{r4,r5,r6,r7}
    88 	STMIA	r2!,{r4,r5,r6,r7}
    89 	STMIA	r2!,{r4,r5,r6,r7}
    90 oc_idct8x8_slow_arm_cols
    91 ; Column transforms
    92 	BL	idct8core_down_arm
    93 	BL	idct8core_down_arm
    94 	BL	idct8core_down_arm
    95 	BL	idct8core_down_arm
    96 	BL	idct8core_down_arm
    97 	BL	idct8core_down_arm
    98 	BL	idct8core_down_arm
    99 	BL	idct8core_down_arm
   100 	ADD	r13,r13,#64*2
   101 	LDMFD	r13!,{r4-r11,PC}
   102 	ENDP
   104 oc_idct8x8_10_arm PROC
   105 	STMFD	r13!,{r4-r11,r14}
   106 	SUB	r13,r13,#64*2
   107 ; Row transforms
   108 	MOV	r2, r0
   109 	MOV	r0, r13		; Write to temp storage.
   110 	BL	idct4core_arm
   111 	BL	idct3core_arm
   112 	BL	idct2core_arm
   113 	BL	idct1core_arm
   114 	; Clear input data for next block (decoder only).
   115 	SUB	r0, r1, #4*16
   116 	CMP	r0, r2
   117 	MOV	r1, r13		; Read from temp storage.
   118 	BEQ	oc_idct8x8_10_arm_cols
   119 	MOV	r4, #0
   120 	STR	r4, [r0]
   121 	STR	r4, [r0,#4]
   122 	STR	r4, [r0,#16]
   123 	STR	r4, [r0,#20]
   124 	STR	r4, [r0,#32]
   125 	STR	r4, [r0,#48]
   126 	MOV	r0, r2		; Write to the final destination
   127 oc_idct8x8_10_arm_cols
   128 ; Column transforms
   129 	BL	idct4core_down_arm
   130 	BL	idct4core_down_arm
   131 	BL	idct4core_down_arm
   132 	BL	idct4core_down_arm
   133 	BL	idct4core_down_arm
   134 	BL	idct4core_down_arm
   135 	BL	idct4core_down_arm
   136 	BL	idct4core_down_arm
   137 	ADD	r13,r13,#64*2
   138 	LDMFD	r13!,{r4-r11,PC}
   139 	ENDP
   141 oc_idct8x8_6_arm PROC
   142 	STMFD	r13!,{r4-r7,r9-r11,r14}
   143 	SUB	r13,r13,#64*2
   144 ; Row transforms
   145 	MOV	r2, r0
   146 	MOV	r0, r13		; Write to temp storage.
   147 	BL	idct3core_arm
   148 	BL	idct2core_arm
   149 	BL	idct1core_arm
   150 	; Clear input data for next block (decoder only).
   151 	SUB	r0, r1, #3*16
   152 	CMP	r0, r2
   153 	MOV	r1, r13		; Read from temp storage.
   154 	BEQ	oc_idct8x8_6_arm_cols
   155 	MOV	r4, #0
   156 	STR	r4, [r0]
   157 	STR	r4, [r0,#4]
   158 	STR	r4, [r0,#16]
   159 	STR	r4, [r0,#32]
   160 	MOV	r0, r2		; Write to the final destination
   161 oc_idct8x8_6_arm_cols
   162 ; Column transforms
   163 	BL	idct3core_down_arm
   164 	BL	idct3core_down_arm
   165 	BL	idct3core_down_arm
   166 	BL	idct3core_down_arm
   167 	BL	idct3core_down_arm
   168 	BL	idct3core_down_arm
   169 	BL	idct3core_down_arm
   170 	BL	idct3core_down_arm
   171 	ADD	r13,r13,#64*2
   172 	LDMFD	r13!,{r4-r7,r9-r11,PC}
   173 	ENDP
   175 oc_idct8x8_3_arm PROC
   176 	STMFD	r13!,{r4-r7,r9-r11,r14}
   177 	SUB	r13,r13,#64*2
   178 ; Row transforms
   179 	MOV	r2, r0
   180 	MOV	r0, r13		; Write to temp storage.
   181 	BL	idct2core_arm
   182 	BL	idct1core_arm
   183 	; Clear input data for next block (decoder only).
   184 	SUB	r0, r1, #2*16
   185 	CMP	r0, r2
   186 	MOV	r1, r13		; Read from temp storage.
   187 	MOVNE	r4, #0
   188 	STRNE	r4, [r0]
   189 	STRNE	r4, [r0,#16]
   190 	MOVNE	r0, r2		; Write to the final destination
   191 ; Column transforms
   192 	BL	idct2core_down_arm
   193 	BL	idct2core_down_arm
   194 	BL	idct2core_down_arm
   195 	BL	idct2core_down_arm
   196 	BL	idct2core_down_arm
   197 	BL	idct2core_down_arm
   198 	BL	idct2core_down_arm
   199 	BL	idct2core_down_arm
   200 	ADD	r13,r13,#64*2
   201 	LDMFD	r13!,{r4-r7,r9-r11,PC}
   202 	ENDP
   204 idct1core_arm PROC
   205 	; r0 =       ogg_int16_t *_y (destination)
   206 	; r1 = const ogg_int16_t *_x (source)
   207 	LDRSH	r3, [r1], #16
   208 	MOV	r12,#0x05
   209 	ORR	r12,r12,#0xB500
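	; 0xB505 = 46341 = OC_C4S4, built with MOV+ORR because the value cannot
	;  be encoded as a single ARM immediate.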
   210 	MUL	r3, r12, r3
   211 	; Stall ?
   212 	MOV	r3, r3, ASR #16
   213 	STRH	r3, [r0], #2
   214 	STRH	r3, [r0, #14]
   215 	STRH	r3, [r0, #30]
   216 	STRH	r3, [r0, #46]
   217 	STRH	r3, [r0, #62]
   218 	STRH	r3, [r0, #78]
   219 	STRH	r3, [r0, #94]
   220 	STRH	r3, [r0, #110]
   221 	MOV	PC,R14
   222 	ENDP
   224 idct2core_arm PROC
   225 	; r0 =       ogg_int16_t *_y (destination)
   226 	; r1 = const ogg_int16_t *_x (source)
   227 	LDRSH	r9, [r1], #16		; r9 = x[0]
   228 	LDR	r12,OC_C4S4
   229 	LDRSH	r11,[r1, #-14]		; r11= x[1]
   230 	LDR	r3, OC_C7S1
   231 	MUL	r9, r12,r9		; r9 = t[0]<<16 = OC_C4S4*x[0]
   232 	LDR	r10,OC_C1S7
   233 	MUL	r3, r11,r3		; r3 = t[4]<<16 = OC_C7S1*x[1]
   234 	MOV	r9, r9, ASR #16		; r9 = t[0]
   235 	MUL	r11,r10,r11		; r11= t[7]<<16 = OC_C1S7*x[1]
   236 	MOV	r3, r3, ASR #16		; r3 = t[4]
   237 	MUL	r10,r12,r3		; r10= t[5]<<16 = OC_C4S4*t[4]
   238 	MOV	r11,r11,ASR #16		; r11= t[7]
   239 	MUL	r12,r11,r12		; r12= t[6]<<16 = OC_C4S4*t[7]
   240 	MOV	r10,r10,ASR #16		; r10= t[5]
   241 	ADD	r12,r9,r12,ASR #16	; r12= t[0]+t[6]
   242 	ADD	r12,r12,r10		; r12= t[0]+t2[6] = t[0]+t[6]+t[5]
   243 	SUB	r10,r12,r10,LSL #1	; r10= t[0]+t2[5] = t[0]+t[6]-t[5]
   244 	ADD	r3, r3, r9		; r3 = t[0]+t[4]
   245 	ADD	r11,r11,r9		; r11= t[0]+t[7]
   246 	STRH	r11,[r0], #2		; y[0] = t[0]+t[7]
   247 	STRH	r12,[r0, #14]		; y[1] = t[0]+t[6]
   248 	STRH	r10,[r0, #30]		; y[2] = t[0]+t[5]
   249 	STRH	r3, [r0, #46]		; y[3] = t[0]+t[4]
   250 	RSB	r3, r3, r9, LSL #1	; r3 = t[0]*2-(t[0]+t[4])=t[0]-t[4]
   251 	RSB	r10,r10,r9, LSL #1	; r10= t[0]*2-(t[0]+t[5])=t[0]-t[5]
   252 	RSB	r12,r12,r9, LSL #1	; r12= t[0]*2-(t[0]+t[6])=t[0]-t[6]
   253 	RSB	r11,r11,r9, LSL #1	; r11= t[0]*2-(t[0]+t[7])=t[0]-t[7]
   254 	STRH	r3, [r0, #62]		; y[4] = t[0]-t[4]
   255 	STRH	r10,[r0, #78]		; y[5] = t[0]-t[5]
   256 	STRH	r12,[r0, #94]		; y[6] = t[0]-t[6]
   257 	STRH	r11,[r0, #110]		; y[7] = t[0]-t[7]
   258 	MOV	PC,r14
   259 	ENDP
   261 idct2core_down_arm PROC
   262 	; r0 =       ogg_int16_t *_y (destination)
   263 	; r1 = const ogg_int16_t *_x (source)
   264 	LDRSH	r9, [r1], #16		; r9 = x[0]
   265 	LDR	r12,OC_C4S4
   266 	LDRSH	r11,[r1, #-14]		; r11= x[1]
   267 	LDR	r3, OC_C7S1
   268 	MUL	r9, r12,r9		; r9 = t[0]<<16 = OC_C4S4*x[0]
   269 	LDR	r10,OC_C1S7
   270 	MUL	r3, r11,r3		; r3 = t[4]<<16 = OC_C7S1*x[1]
   271 	MOV	r9, r9, ASR #16		; r9 = t[0]
   272 	MUL	r11,r10,r11		; r11= t[7]<<16 = OC_C1S7*x[1]
   273 	ADD	r9, r9, #8		; r9 = t[0]+8
   274 	MOV	r3, r3, ASR #16		; r3 = t[4]
   275 	MUL	r10,r12,r3		; r10= t[5]<<16 = OC_C4S4*t[4]
   276 	MOV	r11,r11,ASR #16		; r11= t[7]
   277 	MUL	r12,r11,r12		; r12= t[6]<<16 = OC_C4S4*t[7]
   278 	MOV	r10,r10,ASR #16		; r10= t[5]
   279 	ADD	r12,r9,r12,ASR #16	; r12= t[0]+t[6]+8
   280 	ADD	r12,r12,r10		; r12= t[0]+t2[6] = t[0]+t[6]+t[5]+8
   281 	SUB	r10,r12,r10,LSL #1	; r10= t[0]+t2[5] = t[0]+t[6]-t[5]+8
   282 	ADD	r3, r3, r9		; r3 = t[0]+t[4]+8
   283 	ADD	r11,r11,r9		; r11= t[0]+t[7]+8
   284 	; TODO: This is wrong.
   285 	; The C code truncates to 16 bits by storing to RAM and doing the
   286 	;  shifts later; we've got an extra 4 bits here.
   287 	MOV	r4, r11,ASR #4
   288 	MOV	r5, r12,ASR #4
   289 	MOV	r6, r10,ASR #4
   290 	MOV	r7, r3, ASR #4
   291 	RSB	r3, r3, r9, LSL #1	;r3 =t[0]*2+8-(t[0]+t[4])=t[0]-t[4]+8
   292 	RSB	r10,r10,r9, LSL #1	;r10=t[0]*2+8-(t[0]+t[5])=t[0]-t[5]+8
   293 	RSB	r12,r12,r9, LSL #1	;r12=t[0]*2+8-(t[0]+t[6])=t[0]-t[6]+8
   294 	RSB	r11,r11,r9, LSL #1	;r11=t[0]*2+8-(t[0]+t[7])=t[0]-t[7]+8
   295 	MOV	r3, r3, ASR #4
   296 	MOV	r10,r10,ASR #4
   297 	MOV	r12,r12,ASR #4
   298 	MOV	r11,r11,ASR #4
   299 	STRH	r4, [r0], #2		; y[0] = t[0]+t[7]
   300 	STRH	r5, [r0, #14]		; y[1] = t[0]+t[6]
   301 	STRH	r6, [r0, #30]		; y[2] = t[0]+t[5]
   302 	STRH	r7, [r0, #46]		; y[3] = t[0]+t[4]
   303 	STRH	r3, [r0, #62]		; y[4] = t[0]-t[4]
   304 	STRH	r10,[r0, #78]		; y[5] = t[0]-t[5]
   305 	STRH	r12,[r0, #94]		; y[6] = t[0]-t[6]
   306 	STRH	r11,[r0, #110]		; y[7] = t[0]-t[7]
   307 	MOV	PC,r14
   308 	ENDP
   310 idct3core_arm PROC
   311 	LDRSH	r9, [r1], #16		; r9 = x[0]
   312 	LDR	r12,OC_C4S4		; r12= OC_C4S4
   313 	LDRSH	r3, [r1, #-12]		; r3 = x[2]
   314 	LDR	r10,OC_C6S2		; r10= OC_C6S2
   315 	MUL	r9, r12,r9		; r9 = t[0]<<16 = OC_C4S4*x[0]
   316 	LDR	r4, OC_C2S6		; r4 = OC_C2S6
   317 	MUL	r10,r3, r10		; r10= t[2]<<16 = OC_C6S2*x[2]
   318 	LDRSH	r11,[r1, #-14]		; r11= x[1]
   319 	MUL	r3, r4, r3		; r3 = t[3]<<16 = OC_C2S6*x[2]
   320 	LDR	r4, OC_C7S1		; r4 = OC_C7S1
   321 	LDR	r5, OC_C1S7		; r5 = OC_C1S7
   322 	MOV	r9, r9, ASR #16		; r9 = t[0]
   323 	MUL	r4, r11,r4		; r4 = t[4]<<16 = OC_C7S1*x[1]
   324 	ADD	r3, r9, r3, ASR #16	; r3 = t[0]+t[3]
   325 	MUL	r11,r5, r11		; r11= t[7]<<16 = OC_C1S7*x[1]
   326 	MOV	r4, r4, ASR #16		; r4 = t[4]
   327 	MUL	r5, r12,r4		; r5 = t[5]<<16 = OC_C4S4*t[4]
   328 	MOV	r11,r11,ASR #16		; r11= t[7]
   329 	MUL	r12,r11,r12		; r12= t[6]<<16 = OC_C4S4*t[7]
   330 	ADD	r10,r9, r10,ASR #16	; r10= t[1] = t[0]+t[2]
   331 	RSB	r6, r10,r9, LSL #1	; r6 = t[2] = t[0]-t[2]
   332 					; r3 = t2[0] = t[0]+t[3]
   333 	RSB	r9, r3, r9, LSL #1	; r9 = t2[3] = t[0]-t[3]
   334 	MOV	r12,r12,ASR #16		; r12= t[6]
   335 	ADD	r5, r12,r5, ASR #16	; r5 = t2[6] = t[6]+t[5]
   336 	RSB	r12,r5, r12,LSL #1	; r12= t2[5] = t[6]-t[5]
   337 	ADD	r11,r3, r11		; r11= t2[0]+t[7]
   338 	ADD	r5, r10,r5		; r5 = t[1]+t2[6]
   339 	ADD	r12,r6, r12		; r12= t[2]+t2[5]
   340 	ADD	r4, r9, r4		; r4 = t2[3]+t[4]
   341 	STRH	r11,[r0], #2		; y[0] = t[0]+t[7]
   342 	STRH	r5, [r0, #14]		; y[1] = t[1]+t2[6]
   343 	STRH	r12,[r0, #30]		; y[2] = t[2]+t2[5]
   344 	STRH	r4, [r0, #46]		; y[3] = t2[3]+t[4]
   345 	RSB	r11,r11,r3, LSL #1	; r11= t2[0] - t[7]
   346 	RSB	r5, r5, r10,LSL #1	; r5 = t[1]  - t2[6]
   347 	RSB	r12,r12,r6, LSL #1	; r12= t[2]  - t2[5]
   348 	RSB	r4, r4, r9, LSL #1	; r4 = t2[3] - t[4]
   349 	STRH	r4, [r0, #62]		; y[4] = t2[3]-t[4]
   350 	STRH	r12,[r0, #78]		; y[5] = t[2]-t2[5]
   351 	STRH	r5, [r0, #94]		; y[6] = t[1]-t2[6]
   352 	STRH	r11,[r0, #110]		; y[7] = t2[0]-t[7]
   353 	MOV	PC,R14
   354 	ENDP
   356 idct3core_down_arm PROC
   357 	LDRSH	r9, [r1], #16		; r9 = x[0]
   358 	LDR	r12,OC_C4S4		; r12= OC_C4S4
   359 	LDRSH	r3, [r1, #-12]		; r3 = x[2]
   360 	LDR	r10,OC_C6S2		; r10= OC_C6S2
   361 	MUL	r9, r12,r9		; r9 = t[0]<<16 = OC_C4S4*x[0]
   362 	LDR	r4, OC_C2S6		; r4 = OC_C2S6
   363 	MUL	r10,r3, r10		; r10= t[2]<<16 = OC_C6S2*x[2]
   364 	LDRSH	r11,[r1, #-14]		; r11= x[1]
   365 	MUL	r3, r4, r3		; r3 = t[3]<<16 = OC_C2S6*x[2]
   366 	LDR	r4, OC_C7S1		; r4 = OC_C7S1
   367 	LDR	r5, OC_C1S7		; r5 = OC_C1S7
   368 	MOV	r9, r9, ASR #16		; r9 = t[0]
   369 	MUL	r4, r11,r4		; r4 = t[4]<<16 = OC_C7S1*x[1]
   370 	ADD	r9, r9, #8		; r9 = t[0]+8
   371 	MUL	r11,r5, r11		; r11= t[7]<<16 = OC_C1S7*x[1]
   372 	ADD	r3, r9, r3, ASR #16	; r3 = t[0]+t[3]+8
   373 	MOV	r4, r4, ASR #16		; r4 = t[4]
   374 	MUL	r5, r12,r4		; r5 = t[5]<<16 = OC_C4S4*t[4]
   375 	MOV	r11,r11,ASR #16		; r11= t[7]
   376 	MUL	r12,r11,r12		; r12= t[6]<<16 = OC_C4S4*t[7]
   377 	ADD	r10,r9, r10,ASR #16	; r10= t[1]+8 = t[0]+t[2]+8
   378 	RSB	r6, r10,r9, LSL #1	; r6 = t[2]+8 = t[0]-t[2]+8
   379 					; r3 = t2[0]+8 = t[0]+t[3]+8
   380 	RSB	r9, r3, r9, LSL #1	; r9 = t2[3]+8 = t[0]-t[3]+8
   381 	MOV	r12,r12,ASR #16		; r12= t[6]
   382 	ADD	r5, r12,r5, ASR #16	; r5 = t2[6] = t[6]+t[5]
   383 	RSB	r12,r5, r12,LSL #1	; r12= t2[5] = t[6]-t[5]
   384 	ADD	r11,r3, r11		; r11= t2[0]+t[7] +8
   385 	ADD	r5, r10,r5		; r5 = t[1] +t2[6]+8
   386 	ADD	r12,r6, r12		; r12= t[2] +t2[5]+8
   387 	ADD	r4, r9, r4		; r4 = t2[3]+t[4] +8
   388 	RSB	r3, r11,r3, LSL #1	; r3 = t2[0] - t[7]  + 8
   389 	RSB	r10,r5, r10,LSL #1	; r10= t[1]  - t2[6] + 8
   390 	RSB	r6, r12,r6, LSL #1	; r6 = t[2]  - t2[5] + 8
   391 	RSB	r9, r4, r9, LSL #1	; r9 = t2[3] - t[4]  + 8
   392 	; TODO: This is wrong.
   393 	; The C code truncates to 16 bits by storing to RAM and doing the
   394 	;  shifts later; we've got an extra 4 bits here.
   395 	MOV	r11,r11,ASR #4
   396 	MOV	r5, r5, ASR #4
   397 	MOV	r12,r12,ASR #4
   398 	MOV	r4, r4, ASR #4
   399 	MOV	r9, r9, ASR #4
   400 	MOV	r6, r6, ASR #4
   401 	MOV	r10,r10,ASR #4
   402 	MOV	r3, r3, ASR #4
   403 	STRH	r11,[r0], #2		; y[0] = t[0]+t[7]
   404 	STRH	r5, [r0, #14]		; y[1] = t[1]+t2[6]
   405 	STRH	r12,[r0, #30]		; y[2] = t[2]+t2[5]
   406 	STRH	r4, [r0, #46]		; y[3] = t2[3]+t[4]
   407 	STRH	r9, [r0, #62]		; y[4] = t2[3]-t[4]
   408 	STRH	r6, [r0, #78]		; y[5] = t[2]-t2[5]
   409 	STRH	r10,[r0, #94]		; y[6] = t[1]-t2[6]
   410 	STRH	r3, [r0, #110]		; y[7] = t2[0]-t[7]
   411 	MOV	PC,R14
   412 	ENDP
   414 idct4core_arm PROC
   415 	; r0 =       ogg_int16_t *_y (destination)
   416 	; r1 = const ogg_int16_t *_x (source)
   417 	LDRSH	r9, [r1], #16		; r9 = x[0]
   418 	LDR	r10,OC_C4S4		; r10= OC_C4S4
   419 	LDRSH	r12,[r1, #-12]		; r12= x[2]
   420 	LDR	r4, OC_C6S2		; r4 = OC_C6S2
   421 	MUL	r9, r10,r9		; r9 = t[0]<<16 = OC_C4S4*x[0]
   422 	LDR	r5, OC_C2S6		; r5 = OC_C2S6
   423 	MUL	r4, r12,r4		; r4 = t[2]<<16 = OC_C6S2*x[2]
   424 	LDRSH	r3, [r1, #-14]		; r3 = x[1]
   425 	MUL	r5, r12,r5		; r5 = t[3]<<16 = OC_C2S6*x[2]
   426 	LDR	r6, OC_C7S1		; r6 = OC_C7S1
   427 	LDR	r12,OC_C1S7		; r12= OC_C1S7
   428 	LDRSH	r11,[r1, #-10]		; r11= x[3]
   429 	MUL	r6, r3, r6		; r6 = t[4]<<16 = OC_C7S1*x[1]
   430 	LDR	r7, OC_C5S3		; r7 = OC_C5S3
   431 	MUL	r3, r12,r3		; r3 = t[7]<<16 = OC_C1S7*x[1]
   432 	LDR	r8, OC_C3S5		; r8 = OC_C3S5
   433 	MUL	r7, r11,r7		; r7 = -t[5]<<16 = OC_C5S3*x[3]
   434 	MOV	r9, r9, ASR #16		; r9 = t[0]
   435 	MUL	r11,r8, r11		; r11= t[6]<<16 = OC_C3S5*x[3]
   436 	MOV	r6, r6, ASR #16		; r6 = t[4]
   437 ; TODO: This is wrong; t[4]-t[5] and t[7]-t[6] need to be truncated to 16-bit
   438 ; before multiplying, not after (this is not equivalent)
   439 	SUB	r7, r6, r7, ASR #16	; r7 = t2[4]=t[4]+t[5] (as r7=-t[5])
   440 	RSB	r6, r7, r6, LSL #1	; r6 = t[4]-t[5]
   441 	MUL	r6, r10,r6		; r6 = t2[5]<<16 =OC_C4S4*(t[4]-t[5])
   442 	MOV	r3, r3, ASR #16		; r3 = t[7]
   443 	ADD	r11,r3, r11,ASR #16	; r11= t2[7]=t[7]+t[6]
   444 	RSB	r3, r11,r3, LSL #1	; r3 = t[7]-t[6]
   445 	MUL	r3, r10,r3		; r3 = t2[6]<<16 =OC_C4S4*(t[7]-t[6])
   446 	ADD	r4, r9, r4, ASR #16	; r4 = t[1] = t[0] + t[2]
   447 	RSB	r10,r4, r9, LSL #1	; r10= t[2] = t[0] - t[2]
   448 	ADD	r5, r9, r5, ASR #16	; r5 = t[0] = t[0] + t[3]
   449 	RSB	r9, r5, r9, LSL #1	; r9 = t[3] = t[0] - t[3]
   450 	MOV	r3, r3, ASR #16		; r3 = t2[6]
   451 	ADD	r6, r3, r6, ASR #16	; r6 = t3[6] = t2[6]+t2[5]
   452 	RSB	r3, r6, r3, LSL #1	; r3 = t3[5] = t2[6]-t2[5]
   453 	ADD	r11,r5, r11		; r11= t[0]+t2[7]
   454 	ADD	r6, r4, r6		; r6 = t[1]+t3[6]
   455 	ADD	r3, r10,r3		; r3 = t[2]+t3[5]
   456 	ADD	r7, r9, r7		; r7 = t[3]+t2[4]
   457 	STRH	r11,[r0], #2		; y[0] = t[0]+t[7]
   458 	STRH	r6, [r0, #14]		; y[1] = t[1]+t2[6]
   459 	STRH	r3, [r0, #30]		; y[2] = t[2]+t2[5]
   460 	STRH	r7, [r0, #46]		; y[3] = t2[3]+t[4]
   461 	RSB	r11,r11,r5, LSL #1	; r11= t[0]-t2[7]
   462 	RSB	r6, r6, r4, LSL #1	; r6 = t[1]-t3[6]
   463 	RSB	r3, r3, r10,LSL #1	; r3 = t[2]-t3[5]
   464 	RSB	r7, r7, r9, LSL #1	; r7 = t[3]-t2[4]
   465 	STRH	r7, [r0, #62]		; y[4] = t2[3]-t[4]
   466 	STRH	r3, [r0, #78]		; y[5] = t[2]-t2[5]
   467 	STRH	r6, [r0, #94]		; y[6] = t[1]-t2[6]
   468 	STRH	r11, [r0, #110]		; y[7] = t2[0]-t[7]
   469 	MOV	PC,r14
   470 	ENDP
   472 idct4core_down_arm PROC
   473 	; r0 =       ogg_int16_t *_y (destination)
   474 	; r1 = const ogg_int16_t *_x (source)
   475 	LDRSH	r9, [r1], #16		; r9 = x[0]
   476 	LDR	r10,OC_C4S4		; r10= OC_C4S4
   477 	LDRSH	r12,[r1, #-12]		; r12= x[2]
   478 	LDR	r4, OC_C6S2		; r4 = OC_C6S2
   479 	MUL	r9, r10,r9		; r9 = t[0]<<16 = OC_C4S4*x[0]
   480 	LDR	r5, OC_C2S6		; r5 = OC_C2S6
   481 	MUL	r4, r12,r4		; r4 = t[2]<<16 = OC_C6S2*x[2]
   482 	LDRSH	r3, [r1, #-14]		; r3 = x[1]
   483 	MUL	r5, r12,r5		; r5 = t[3]<<16 = OC_C2S6*x[2]
   484 	LDR	r6, OC_C7S1		; r6 = OC_C7S1
   485 	LDR	r12,OC_C1S7		; r12= OC_C1S7
   486 	LDRSH	r11,[r1, #-10]		; r11= x[3]
   487 	MUL	r6, r3, r6		; r6 = t[4]<<16 = OC_C7S1*x[1]
   488 	LDR	r7, OC_C5S3		; r7 = OC_C5S3
   489 	MUL	r3, r12,r3		; r3 = t[7]<<16 = OC_C1S7*x[1]
   490 	LDR	r8, OC_C3S5		; r8 = OC_C3S5
   491 	MUL	r7, r11,r7		; r7 = -t[5]<<16 = OC_C5S3*x[3]
   492 	MOV	r9, r9, ASR #16		; r9 = t[0]
   493 	MUL	r11,r8, r11		; r11= t[6]<<16 = OC_C3S5*x[3]
   494 	MOV	r6, r6, ASR #16		; r6 = t[4]
   495 ; TODO: This is wrong; t[4]-t[5] and t[7]-t[6] need to be truncated to 16-bit
   496 ; before multiplying, not after (this is not equivalent)
   497 	SUB	r7, r6, r7, ASR #16	; r7 = t2[4]=t[4]+t[5] (as r7=-t[5])
   498 	RSB	r6, r7, r6, LSL #1	; r6 = t[4]-t[5]
   499 	MUL	r6, r10,r6		; r6 = t2[5]<<16 =OC_C4S4*(t[4]-t[5])
   500 	MOV	r3, r3, ASR #16		; r3 = t[7]
   501 	ADD	r11,r3, r11,ASR #16	; r11= t2[7]=t[7]+t[6]
   502 	RSB	r3, r11,r3, LSL #1	; r3 = t[7]-t[6]
   503 	ADD	r9, r9, #8		; r9 = t[0]+8
   504 	MUL	r3, r10,r3		; r3 = t2[6]<<16 =OC_C4S4*(t[7]-t[6])
   505 	ADD	r4, r9, r4, ASR #16	; r4 = t[1] = t[0] + t[2] + 8
   506 	RSB	r10,r4, r9, LSL #1	; r10= t[2] = t[0] - t[2] + 8
   507 	ADD	r5, r9, r5, ASR #16	; r5 = t[0] = t[0] + t[3] + 8
   508 	RSB	r9, r5, r9, LSL #1	; r9 = t[3] = t[0] - t[3] + 8
   509 	MOV	r3, r3, ASR #16		; r3 = t2[6]
   510 	ADD	r6, r3, r6, ASR #16	; r6 = t3[6] = t2[6]+t2[5]
   511 	RSB	r3, r6, r3, LSL #1	; r3 = t3[5] = t2[6]-t2[5]
   512 	ADD	r5, r5, r11		; r5 = t[0]+t2[7]+8
   513 	ADD	r4, r4, r6		; r4 = t[1]+t3[6]+8
   514 	ADD	r10,r10,r3		; r10= t[2]+t3[5]+8
   515 	ADD	r9, r9, r7		; r9 = t[3]+t2[4]+8
   516 	SUB	r11,r5, r11,LSL #1	; r11= t[0]-t2[7]+8
   517 	SUB	r6, r4, r6, LSL #1	; r6 = t[1]-t3[6]+8
   518 	SUB	r3, r10,r3, LSL #1	; r3 = t[2]-t3[5]+8
   519 	SUB	r7, r9, r7, LSL #1	; r7 = t[3]-t2[4]+8
   520 	; TODO: This is wrong.
   521 	; The C code truncates to 16 bits by storing to RAM and doing the
   522 	;  shifts later; we've got an extra 4 bits here.
   523 	MOV	r11,r11,ASR #4
   524 	MOV	r6, r6, ASR #4
   525 	MOV	r3, r3, ASR #4
   526 	MOV	r7, r7, ASR #4
   527 	MOV	r9, r9, ASR #4
   528 	MOV	r10,r10,ASR #4
   529 	MOV	r4, r4, ASR #4
   530 	MOV	r5, r5, ASR #4
   531 	STRH	r5,[r0], #2		; y[0] = t[0]+t[7]
   532 	STRH	r4, [r0, #14]		; y[1] = t[1]+t2[6]
   533 	STRH	r10,[r0, #30]		; y[2] = t[2]+t2[5]
   534 	STRH	r9, [r0, #46]		; y[3] = t2[3]+t[4]
   535 	STRH	r7, [r0, #62]		; y[4] = t2[3]-t[4]
   536 	STRH	r3, [r0, #78]		; y[5] = t[2]-t2[5]
   537 	STRH	r6, [r0, #94]		; y[6] = t[1]-t2[6]
   538 	STRH	r11,[r0, #110]		; y[7] = t2[0]-t[7]
   539 	MOV	PC,r14
   540 	ENDP
   542 idct8core_arm PROC
   543 	; r0 =       ogg_int16_t *_y (destination)
   544 	; r1 = const ogg_int16_t *_x (source)
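	; Full 8-point 1-D IDCT on one row of 8 coefficients.  The source is
	;  read as consecutive halfwords (r1 advances by one 16-byte row per
	;  call) while the eight results are stored with a 16-byte stride, so
	;  each pass writes its output transposed; the next pass then reads the
	;  transposed data row by row and transposes it back into place.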
   545 	LDRSH	r2, [r1],#16		; r2 = x[0]
   546 	STMFD	r13!,{r1,r14}
   547 	LDRSH	r6, [r1, #-8]		; r6 = x[4]
   548 	LDR	r12,OC_C4S4		; r12= C4S4
   549 	LDRSH	r4, [r1, #-12]		; r4 = x[2]
   550 	ADD	r2, r2, r6		; r2 = x[0] + x[4]
   551 	SUB	r6, r2, r6, LSL #1	; r6 = x[0] - x[4]
   552 	; For spec compliance, these sums must be truncated to 16-bit precision
   553 	; _before_ the multiply (not after).
   554 	; Sadly, ARMv4 provides no simple way to do that.
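	; (The LSL #16/ASR #16 pairs below sign-extend the low 16 bits, which
	;  performs that truncation.)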
   555 	MOV	r2, r2, LSL #16
   556 	MOV	r6, r6, LSL #16
   557 	MOV	r2, r2, ASR #16
   558 	MOV	r6, r6, ASR #16
   559 	MUL	r2, r12,r2		; r2 = t[0]<<16 = C4S4*(x[0]+x[4])
   560 	LDRSH	r8, [r1, #-4]		; r8 = x[6]
   561 	LDR	r7, OC_C6S2		; r7 = OC_C6S2
   562 	MUL	r6, r12,r6		; r6 = t[1]<<16 = C4S4*(x[0]-x[4])
   563 	LDR	r14,OC_C2S6		; r14= OC_C2S6
   564 	MUL	r3, r4, r7		; r3 = OC_C6S2*x[2]
   565 	LDR	r5, OC_C7S1		; r5 = OC_C7S1
   566 	MUL	r4, r14,r4		; r4 = OC_C2S6*x[2]
   567 	MOV	r3, r3, ASR #16		; r3 = OC_C6S2*x[2]>>16
   568 	MUL	r14,r8, r14		; r14= OC_C2S6*x[6]
   569 	MOV	r4, r4, ASR #16		; r4 = OC_C2S6*x[2]>>16
   570 	MUL	r8, r7, r8		; r8 = OC_C6S2*x[6]
   571 	LDR	r7, OC_C1S7		; r7 = OC_C1S7
   572 	SUB	r3, r3, r14,ASR #16	; r3=t[2]=C6S2*x[2]>>16-C2S6*x[6]>>16
   573 	LDRSH	r14,[r1, #-14]		; r14= x[1]
   574 	ADD	r4, r4, r8, ASR #16	; r4=t[3]=C2S6*x[2]>>16+C6S2*x[6]>>16
   575 	LDRSH	r8, [r1, #-2]		; r8 = x[7]
   576 	MUL	r9, r5, r14		; r9 = OC_C7S1*x[1]
   577 	LDRSH	r10,[r1, #-6]		; r10= x[5]
   578 	MUL	r14,r7, r14		; r14= OC_C1S7*x[1]
   579 	MOV	r9, r9, ASR #16		; r9 = OC_C7S1*x[1]>>16
   580 	MUL	r7, r8, r7		; r7 = OC_C1S7*x[7]
   581 	MOV	r14,r14,ASR #16		; r14= OC_C1S7*x[1]>>16
   582 	MUL	r8, r5, r8		; r8 = OC_C7S1*x[7]
   583 	LDRSH	r1, [r1, #-10]		; r1 = x[3]
   584 	LDR	r5, OC_C3S5		; r5 = OC_C3S5
   585 	LDR	r11,OC_C5S3		; r11= OC_C5S3
   586 	ADD	r8, r14,r8, ASR #16	; r8=t[7]=C1S7*x[1]>>16+C7S1*x[7]>>16
   587 	MUL	r14,r5, r10		; r14= OC_C3S5*x[5]
   588 	SUB	r9, r9, r7, ASR #16	; r9=t[4]=C7S1*x[1]>>16-C1S7*x[7]>>16
   589 	MUL	r10,r11,r10		; r10= OC_C5S3*x[5]
   590 	MOV	r14,r14,ASR #16		; r14= OC_C3S5*x[5]>>16
   591 	MUL	r11,r1, r11		; r11= OC_C5S3*x[3]
   592 	MOV	r10,r10,ASR #16		; r10= OC_C5S3*x[5]>>16
   593 	MUL	r1, r5, r1		; r1 = OC_C3S5*x[3]
   594 	SUB	r14,r14,r11,ASR #16	;r14=t[5]=C3S5*x[5]>>16-C5S3*x[3]>>16
   595 	ADD	r10,r10,r1, ASR #16	;r10=t[6]=C5S3*x[5]>>16+C3S5*x[3]>>16
   596 	; r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t[7] r9=t[4]
   597 	; r10=t[6] r12=C4S4 r14=t[5]
   598 ; TODO: This is wrong; t[4]-t[5] and t[7]-t[6] need to be truncated to 16-bit
   599 ; before multiplying, not after (this is not equivalent)
   600 	; Stage 2
   601 	; 4-5 butterfly
   602 	ADD	r9, r9, r14		; r9 = t2[4]     =       t[4]+t[5]
   603 	SUB	r14,r9, r14, LSL #1	; r14=                   t[4]-t[5]
   604 	MUL	r14,r12,r14		; r14= t2[5]<<16 = C4S4*(t[4]-t[5])
   605 	; 7-6 butterfly
   606 	ADD	r8, r8, r10		; r8 = t2[7]     =       t[7]+t[6]
   607 	SUB	r10,r8, r10, LSL #1	; r10=                   t[7]-t[6]
   608 	MUL	r10,r12,r10		; r10= t2[6]<<16 = C4S4*(t[7]-t[6])
   609 	; r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t2[7] r9=t2[4]
   610 	; r10=t2[6]<<16 r12=C4S4 r14=t2[5]<<16
   611 	; Stage 3
   612 	; 0-3 butterfly
   613 	ADD	r2, r4, r2, ASR #16	; r2 = t2[0] = t[0] + t[3]
   614 	SUB	r4, r2, r4, LSL #1	; r4 = t2[3] = t[0] - t[3]
   615 	; 1-2 butterfly
   616 	ADD	r6, r3, r6, ASR #16	; r6 = t2[1] = t[1] + t[2]
   617 	SUB	r3, r6, r3, LSL #1	; r3 = t2[2] = t[1] - t[2]
   618 	; 6-5 butterfly
   619 	MOV	r14,r14,ASR #16		; r14= t2[5]
   620 	ADD	r10,r14,r10,ASR #16	; r10= t3[6] = t[6] + t[5]
   621 	SUB	r14,r10,r14,LSL #1	; r14= t3[5] = t[6] - t[5]
   622 	; r2=t2[0] r3=t2[2] r4=t2[3] r6=t2[1] r8=t2[7] r9=t2[4]
   623 	; r10=t3[6] r14=t3[5]
   624 	; Stage 4
   625 	ADD	r2, r2, r8		; r2 = t[0] + t[7]
   626 	ADD	r6, r6, r10		; r6 = t[1] + t[6]
   627 	ADD	r3, r3, r14		; r3 = t[2] + t[5]
   628 	ADD	r4, r4, r9		; r4 = t[3] + t[4]
   629 	SUB	r8, r2, r8, LSL #1	; r8 = t[0] - t[7]
   630 	SUB	r10,r6, r10,LSL #1	; r10= t[1] - t[6]
   631 	SUB	r14,r3, r14,LSL #1	; r14= t[2] - t[5]
   632 	SUB	r9, r4, r9, LSL #1	; r9 = t[3] - t[4]
   633 	STRH	r2, [r0], #2		; y[0] = t[0]+t[7]
   634 	STRH	r6, [r0, #14]		; y[1] = t[1]+t[6]
   635 	STRH	r3, [r0, #30]		; y[2] = t[2]+t[5]
   636 	STRH	r4, [r0, #46]		; y[3] = t[3]+t[4]
   637 	STRH	r9, [r0, #62]		; y[4] = t[3]-t[4]
   638 	STRH	r14,[r0, #78]		; y[5] = t[2]-t[5]
   639 	STRH	r10,[r0, #94]		; y[6] = t[1]-t[6]
   640 	STRH	r8, [r0, #110]		; y[7] = t[0]-t[7]
   641 	LDMFD	r13!,{r1,PC}
   642 	ENDP
   644 idct8core_down_arm PROC
   645 	; r0 =       ogg_int16_t *_y (destination)
   646 	; r1 = const ogg_int16_t *_x (source)
   647 	LDRSH	r2, [r1],#16		; r2 = x[0]
   648 	STMFD	r13!,{r1,r14}
   649 	LDRSH	r6, [r1, #-8]		; r6 = x[4]
   650 	LDR	r12,OC_C4S4		; r12= C4S4
   651 	LDRSH	r4, [r1, #-12]		; r4 = x[2]
   652 	ADD	r2, r2, r6		; r2 = x[0] + x[4]
   653 	SUB	r6, r2, r6, LSL #1	; r6 = x[0] - x[4]
   654 	; For spec compliance, these sums must be truncated to 16-bit precision
   655 	; _before_ the multiply (not after).
   656 	; Sadly, ARMv4 provides no simple way to do that.
   657 	MOV	r2, r2, LSL #16
   658 	MOV	r6, r6, LSL #16
   659 	MOV	r2, r2, ASR #16
   660 	MOV	r6, r6, ASR #16
   661 	MUL	r2, r12,r2		; r2 = t[0]<<16 = C4S4*(x[0]+x[4])
   662 	LDRSH	r8, [r1, #-4]		; r8 = x[6]
   663 	LDR	r7, OC_C6S2		; r7 = OC_C6S2
   664 	MUL	r6, r12,r6		; r6 = t[1]<<16 = C4S4*(x[0]-x[4])
   665 	LDR	r14,OC_C2S6		; r14= OC_C2S6
   666 	MUL	r3, r4, r7		; r3 = OC_C6S2*x[2]
   667 	LDR	r5, OC_C7S1		; r5 = OC_C7S1
   668 	MUL	r4, r14,r4		; r4 = OC_C2S6*x[2]
   669 	MOV	r3, r3, ASR #16		; r3 = OC_C6S2*x[2]>>16
   670 	MUL	r14,r8, r14		; r14= OC_C2S6*x[6]
   671 	MOV	r4, r4, ASR #16		; r4 = OC_C2S6*x[2]>>16
   672 	MUL	r8, r7, r8		; r8 = OC_C6S2*x[6]
   673 	LDR	r7, OC_C1S7		; r7 = OC_C1S7
   674 	SUB	r3, r3, r14,ASR #16	; r3=t[2]=C6S2*x[2]>>16-C2S6*x[6]>>16
   675 	LDRSH	r14,[r1, #-14]		; r14= x[1]
   676 	ADD	r4, r4, r8, ASR #16	; r4=t[3]=C2S6*x[2]>>16+C6S2*x[6]>>16
   677 	LDRSH	r8, [r1, #-2]		; r8 = x[7]
   678 	MUL	r9, r5, r14		; r9 = OC_C7S1*x[1]
   679 	LDRSH	r10,[r1, #-6]		; r10= x[5]
   680 	MUL	r14,r7, r14		; r14= OC_C1S7*x[1]
   681 	MOV	r9, r9, ASR #16		; r9 = OC_C7S1*x[1]>>16
   682 	MUL	r7, r8, r7		; r7 = OC_C1S7*x[7]
   683 	MOV	r14,r14,ASR #16		; r14= OC_C1S7*x[1]>>16
   684 	MUL	r8, r5, r8		; r8 = OC_C7S1*x[7]
   685 	LDRSH	r1, [r1, #-10]		; r1 = x[3]
   686 	LDR	r5, OC_C3S5		; r5 = OC_C3S5
   687 	LDR	r11,OC_C5S3		; r11= OC_C5S3
   688 	ADD	r8, r14,r8, ASR #16	; r8=t[7]=C1S7*x[1]>>16+C7S1*x[7]>>16
   689 	MUL	r14,r5, r10		; r14= OC_C3S5*x[5]
   690 	SUB	r9, r9, r7, ASR #16	; r9=t[4]=C7S1*x[1]>>16-C1S7*x[7]>>16
   691 	MUL	r10,r11,r10		; r10= OC_C5S3*x[5]
   692 	MOV	r14,r14,ASR #16		; r14= OC_C3S5*x[5]>>16
   693 	MUL	r11,r1, r11		; r11= OC_C5S3*x[3]
   694 	MOV	r10,r10,ASR #16		; r10= OC_C5S3*x[5]>>16
   695 	MUL	r1, r5, r1		; r1 = OC_C3S5*x[3]
   696 	SUB	r14,r14,r11,ASR #16	;r14=t[5]=C3S5*x[5]>>16-C5S3*x[3]>>16
   697 	ADD	r10,r10,r1, ASR #16	;r10=t[6]=C5S3*x[5]>>16+C3S5*x[3]>>16
   698 	; r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t[7] r9=t[4]
   699 	; r10=t[6] r12=C4S4 r14=t[5]
   700 	; Stage 2
   701 ; TODO: This is wrong; t[4]-t[5] and t[7]-t[6] need to be truncated to 16-bit
   702 ; before multiplying, not after (this is not equivalent)
   703 	; 4-5 butterfly
   704 	ADD	r9, r9, r14		; r9 = t2[4]     =       t[4]+t[5]
   705 	SUB	r14,r9, r14, LSL #1	; r14=                   t[4]-t[5]
   706 	MUL	r14,r12,r14		; r14= t2[5]<<16 = C4S4*(t[4]-t[5])
   707 	; 7-6 butterfly
   708 	ADD	r8, r8, r10		; r8 = t2[7]     =       t[7]+t[6]
   709 	SUB	r10,r8, r10, LSL #1	; r10=                   t[7]-t[6]
   710 	MUL	r10,r12,r10		; r10= t2[6]<<16 = C4S4*(t[7]-t[6])
   711 	; r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t2[7] r9=t2[4]
   712 	; r10=t2[6]<<16 r12=C4S4 r14=t2[5]<<16
   713 	; Stage 3
   714 	ADD	r2, r2, #8<<16		; r2 = t[0]+8<<16
   715 	ADD	r6, r6, #8<<16		; r6 = t[1]+8<<16
   716 	; 0-3 butterfly
   717 	ADD	r2, r4, r2, ASR #16	; r2 = t2[0] = t[0] + t[3] + 8
   718 	SUB	r4, r2, r4, LSL #1	; r4 = t2[3] = t[0] - t[3] + 8
   719 	; 1-2 butterfly
   720 	ADD	r6, r3, r6, ASR #16	; r6 = t2[1] = t[1] + t[2] + 8
   721 	SUB	r3, r6, r3, LSL #1	; r3 = t2[2] = t[1] - t[2] + 8
   722 	; 6-5 butterfly
   723 	MOV	r14,r14,ASR #16		; r14= t2[5]
   724 	ADD	r10,r14,r10,ASR #16	; r10= t3[6] = t[6] + t[5]
   725 	SUB	r14,r10,r14,LSL #1	; r14= t3[5] = t[6] - t[5]
   726 	; r2=t2[0] r3=t2[2] r4=t2[3] r6=t2[1] r8=t2[7] r9=t2[4]
   727 	; r10=t3[6] r14=t3[5]
   728 	; Stage 4
   729 	ADD	r2, r2, r8		; r2 = t[0] + t[7] + 8
   730 	ADD	r6, r6, r10		; r6 = t[1] + t[6] + 8
   731 	ADD	r3, r3, r14		; r3 = t[2] + t[5] + 8
   732 	ADD	r4, r4, r9		; r4 = t[3] + t[4] + 8
   733 	SUB	r8, r2, r8, LSL #1	; r8 = t[0] - t[7] + 8
   734 	SUB	r10,r6, r10,LSL #1	; r10= t[1] - t[6] + 8
   735 	SUB	r14,r3, r14,LSL #1	; r14= t[2] - t[5] + 8
   736 	SUB	r9, r4, r9, LSL #1	; r9 = t[3] - t[4] + 8
   737 	; TODO: This is wrong.
   738 	; The C code truncates to 16 bits by storing to RAM and doing the
   739 	;  shifts later; we've got an extra 4 bits here.
   740 	MOV	r2, r2, ASR #4
   741 	MOV	r6, r6, ASR #4
   742 	MOV	r3, r3, ASR #4
   743 	MOV	r4, r4, ASR #4
   744 	MOV	r8, r8, ASR #4
   745 	MOV	r10,r10,ASR #4
   746 	MOV	r14,r14,ASR #4
   747 	MOV	r9, r9, ASR #4
   748 	STRH	r2, [r0], #2		; y[0] = t[0]+t[7]
   749 	STRH	r6, [r0, #14]		; y[1] = t[1]+t[6]
   750 	STRH	r3, [r0, #30]		; y[2] = t[2]+t[5]
   751 	STRH	r4, [r0, #46]		; y[3] = t[3]+t[4]
   752 	STRH	r9, [r0, #62]		; y[4] = t[3]-t[4]
   753 	STRH	r14,[r0, #78]		; y[5] = t[2]-t[5]
   754 	STRH	r10,[r0, #94]		; y[6] = t[1]-t[6]
   755 	STRH	r8, [r0, #110]		; y[7] = t[0]-t[7]
   756 	LDMFD	r13!,{r1,PC}
   757 	ENDP
   759  [ OC_ARM_ASM_MEDIA
   760 	EXPORT	oc_idct8x8_1_v6
   761 	EXPORT	oc_idct8x8_v6
   763 oc_idct8x8_1_v6 PROC
   764 	; r0 = ogg_int16_t  *_y
   765 	; r1 = ogg_uint16_t  _dc
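	; Same DC fill as oc_idct8x8_1_arm, but STRD stores four coefficients
	;  (8 bytes) per instruction; 16 stores cover the whole 8x8 block.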
   766 	ORR	r2, r1, r1, LSL #16
   767 	ORR	r3, r1, r1, LSL #16
   768 	STRD	r2, [r0], #8
   769 	STRD	r2, [r0], #8
   770 	STRD	r2, [r0], #8
   771 	STRD	r2, [r0], #8
   772 	STRD	r2, [r0], #8
   773 	STRD	r2, [r0], #8
   774 	STRD	r2, [r0], #8
   775 	STRD	r2, [r0], #8
   776 	STRD	r2, [r0], #8
   777 	STRD	r2, [r0], #8
   778 	STRD	r2, [r0], #8
   779 	STRD	r2, [r0], #8
   780 	STRD	r2, [r0], #8
   781 	STRD	r2, [r0], #8
   782 	STRD	r2, [r0], #8
   783 	STRD	r2, [r0], #8
   784 	MOV	PC, r14
   785 	ENDP
   787 oc_idct8x8_v6 PROC
   788 	; r0 = ogg_int16_t *_y
   789 	; r1 = ogg_int16_t *_x
   790 	; r2 = int          _last_zzi
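	; ARMv6 media-instruction version: the core routines pack two rows (or
	;  columns) into the halfwords of each register and process them with
	;  SMULW*/PKHBT/SADD16/SSUB16, so each pass needs only four calls
	;  instead of eight.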
   791 	CMP	r2, #3
   792 	BLE	oc_idct8x8_3_v6
   793 	;CMP	r2, #6
   794 	;BLE	oc_idct8x8_6_v6
   795 	CMP	r2, #10
   796 	BLE	oc_idct8x8_10_v6
   797 oc_idct8x8_slow_v6
   798 	STMFD	r13!,{r4-r11,r14}
   799 	SUB	r13,r13,#64*2
   800 ; Row transforms
   801 	STR	r0, [r13,#-4]!
   802 	ADD	r0, r13, #4	; Write to temp storage.
   803 	BL	idct8_8core_v6
   804 	BL	idct8_8core_v6
   805 	BL	idct8_8core_v6
   806 	BL	idct8_8core_v6
   807 	LDR	r0, [r13], #4	; Write to the final destination.
   808 	; Clear input data for next block (decoder only).
   809 	SUB	r2, r1, #8*16
   810 	CMP	r0, r2
   811 	MOV	r1, r13		; And read from temp storage.
   812 	BEQ	oc_idct8x8_slow_v6_cols
   813 	MOV	r4, #0
   814 	MOV	r5, #0
   815 	STRD	r4, [r2], #8
   816 	STRD	r4, [r2], #8
   817 	STRD	r4, [r2], #8
   818 	STRD	r4, [r2], #8
   819 	STRD	r4, [r2], #8
   820 	STRD	r4, [r2], #8
   821 	STRD	r4, [r2], #8
   822 	STRD	r4, [r2], #8
   823 	STRD	r4, [r2], #8
   824 	STRD	r4, [r2], #8
   825 	STRD	r4, [r2], #8
   826 	STRD	r4, [r2], #8
   827 	STRD	r4, [r2], #8
   828 	STRD	r4, [r2], #8
   829 	STRD	r4, [r2], #8
   830 	STRD	r4, [r2], #8
   831 oc_idct8x8_slow_v6_cols
   832 ; Column transforms
   833 	BL	idct8_8core_down_v6
   834 	BL	idct8_8core_down_v6
   835 	BL	idct8_8core_down_v6
   836 	BL	idct8_8core_down_v6
   837 	ADD	r13,r13,#64*2
   838 	LDMFD	r13!,{r4-r11,PC}
   839 	ENDP
   841 oc_idct8x8_10_v6 PROC
   842 	STMFD	r13!,{r4-r11,r14}
   843 	SUB	r13,r13,#64*2+4
   844 ; Row transforms
   845 	MOV	r2, r13
   846 	STR	r0, [r13,#-4]!
   847 	AND	r0, r2, #4	; Align the stack.
   848 	ADD	r0, r0, r2	; Write to temp storage.
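	; The temp storage pointer is rounded up to the next 8-byte boundary,
	;  presumably so the LDRD/STRD in the core routines see doubleword-
	;  aligned addresses; the extra 4 bytes allocated above leave room.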
   849 	BL	idct4_3core_v6
   850 	BL	idct2_1core_v6
   851 	LDR	r0, [r13], #4	; Write to the final destination.
   852 	; Clear input data for next block (decoder only).
   853 	SUB	r2, r1, #4*16
   854 	CMP	r0, r2
   855 	AND	r1, r13,#4	; Align the stack.
   856 	BEQ	oc_idct8x8_10_v6_cols
   857 	MOV	r4, #0
   858 	MOV	r5, #0
   859 	STRD	r4, [r2]
   860 	STRD	r4, [r2,#16]
   861 	STR	r4, [r2,#32]
   862 	STR	r4, [r2,#48]
   863 oc_idct8x8_10_v6_cols
   864 ; Column transforms
   865 	ADD	r1, r1, r13	; And read from temp storage.
   866 	BL	idct4_4core_down_v6
   867 	BL	idct4_4core_down_v6
   868 	BL	idct4_4core_down_v6
   869 	BL	idct4_4core_down_v6
   870 	ADD	r13,r13,#64*2+4
   871 	LDMFD	r13!,{r4-r11,PC}
   872 	ENDP
   874 oc_idct8x8_3_v6 PROC
   875 	STMFD	r13!,{r4-r8,r14}
   876 	SUB	r13,r13,#64*2
   877 ; Row transforms
   878 	MOV	r8, r0
   879 	MOV	r0, r13		; Write to temp storage.
   880 	BL	idct2_1core_v6
   881 	; Clear input data for next block (decoder only).
   882 	SUB	r0, r1, #2*16
   883 	CMP	r0, r8
   884 	MOV	r1, r13		; Read from temp storage.
   885 	MOVNE	r4, #0
   886 	STRNE	r4, [r0]
   887 	STRNE	r4, [r0,#16]
   888 	MOVNE	r0, r8		; Write to the final destination.
   889 ; Column transforms
   890 	BL	idct2_2core_down_v6
   891 	BL	idct2_2core_down_v6
   892 	BL	idct2_2core_down_v6
   893 	BL	idct2_2core_down_v6
   894 	ADD	r13,r13,#64*2
   895 	LDMFD	r13!,{r4-r8,PC}
   896 	ENDP
   898 idct2_1core_v6 PROC
   899 	; r0 =       ogg_int16_t *_y (destination)
   900 	; r1 = const ogg_int16_t *_x (source)
   901 ; Stage 1:
   902 	LDR	r2, [r1], #16		; r2 = <x[0,1]|x[0,0]>
   903 	LDR	r3, OC_C4S4
   904 	LDRSH	r6, [r1], #16		; r6 = x[1,0]
   905 	SMULWB	r12,r3, r2		; r12= t[0,0]=OC_C4S4*x[0,0]>>16
   906 	LDRD	r4, OC_C7S1		; r4 = OC_C7S1; r5 = OC_C1S7
   907 	SMULWB	r6, r3, r6		; r6 = t[1,0]=OC_C4S4*x[1,0]>>16
   908 	SMULWT	r4, r4, r2		; r4 = t[0,4]=OC_C7S1*x[0,1]>>16
   909 	SMULWT	r7, r5, r2		; r7 = t[0,7]=OC_C1S7*x[0,1]>>16
   910 ; Stage 2:
   911 	SMULWB	r5, r3, r4		; r5 = t[0,5]=OC_C4S4*t[0,4]>>16
   912 	PKHBT	r12,r12,r6, LSL #16	; r12= <t[1,0]|t[0,0]>
   913 	SMULWB	r6, r3, r7		; r6 = t[0,6]=OC_C4S4*t[0,7]>>16
   914 	PKHBT	r7, r7, r3		; r7 = <0|t[0,7]>
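	; r3 still holds OC_C4S4 (0x0000B505); its zero top halfword is what
	;  clears the upper half of r7 here and of r4 below.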
   915 ; Stage 3:
   916 	PKHBT	r5, r6, r5, LSL #16	; r5 = <t[0,5]|t[0,6]>
   917 	PKHBT	r4, r4, r3		; r4 = <0|t[0,4]>
   918 	SASX	r5, r5, r5		; r5 = <t[0,6]+t[0,5]|t[0,6]-t[0,5]>
   919 ; Stage 4:
   920 	PKHTB	r6, r3, r5, ASR #16	; r6 = <0|t[0,6]>
   921 	PKHBT	r5, r5, r3		; r5 = <0|t[0,5]>
   922 	SADD16	r3, r12,r7		; r3 = t[0]+t[7]
   923 	STR	r3, [r0], #4		; y[0<<3] = t[0]+t[7]
   924 	SADD16	r3, r12,r6		; r3 = t[0]+t[6]
   925 	STR	r3, [r0, #12]		; y[1<<3] = t[0]+t[6]
   926 	SADD16	r3, r12,r5		; r3 = t[0]+t[5]
   927 	STR	r3, [r0, #28]		; y[2<<3] = t[0]+t[5]
   928 	SADD16	r3, r12,r4		; r3 = t[0]+t[4]
   929 	STR	r3, [r0, #44]		; y[3<<3] = t[0]+t[4]
   930 	SSUB16	r4, r12,r4		; r4 = t[0]-t[4]
   931 	STR	r4, [r0, #60]		; y[4<<3] = t[0]-t[4]
   932 	SSUB16	r5, r12,r5		; r5 = t[0]-t[5]
   933 	STR	r5, [r0, #76]		; y[5<<3] = t[0]-t[5]
   934 	SSUB16	r6, r12,r6		; r6 = t[0]-t[6]
   935 	STR	r6, [r0, #92]		; y[6<<3] = t[0]-t[6]
   936 	SSUB16	r7, r12,r7		; r7 = t[0]-t[7]
   937 	STR	r7, [r0, #108]		; y[7<<3] = t[0]-t[7]
   938 	MOV	PC,r14
   939 	ENDP
   940  ]
   942 	ALIGN 8
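; Fixed-point IDCT constants: OC_CkS(8-k) = round(65536*cos(k*pi/16)), e.g.
;  OC_C4S4 = 46341 ~ 65536/sqrt(2).  Adjacent pairs are fetched together with
;  LDRD in the v6 code, hence the 8-byte alignment.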
   943 OC_C7S1
   944 	DCD	12785 ; 31F1
   945 OC_C1S7
   946 	DCD	64277 ; FB15
   947 OC_C6S2
   948 	DCD	25080 ; 61F8
   949 OC_C2S6
   950 	DCD	60547 ; EC83
   951 OC_C5S3
   952 	DCD	36410 ; 8E3A
   953 OC_C3S5
   954 	DCD	54491 ; D4DB
   955 OC_C4S4
   956 	DCD	46341 ; B505
   958  [ OC_ARM_ASM_MEDIA
   959 idct2_2core_down_v6 PROC
   960 	; r0 =       ogg_int16_t *_y (destination)
   961 	; r1 = const ogg_int16_t *_x (source)
   962 ; Stage 1:
   963 	LDR	r2, [r1], #16		; r2 = <x[0,1]|x[0,0]>
   964 	LDR	r3, OC_C4S4
   965 	MOV	r7,#8			; r7  = 8
   966 	LDR	r6, [r1], #16		; r6 = <x[1,1]|x[1,0]>
   967 	SMLAWB	r12,r3, r2, r7		; r12= (t[0,0]=OC_C4S4*x[0,0]>>16)+8
   968 	LDRD	r4, OC_C7S1		; r4 = OC_C7S1; r5 = OC_C1S7
   969 	SMLAWB	r7, r3, r6, r7		; r7 = (t[1,0]=OC_C4S4*x[1,0]>>16)+8
   970 	SMULWT  r5, r5, r2		; r5 = t[0,7]=OC_C1S7*x[0,1]>>16
   971 	PKHBT	r12,r12,r7, LSL #16	; r12= <t[1,0]+8|t[0,0]+8>
   972 	SMULWT	r4, r4, r2		; r4 = t[0,4]=OC_C7S1*x[0,1]>>16
   973 ; Here we cheat: row 1 had just a DC, so x[0,1]==x[1,1] by definition.
   974 	PKHBT	r7, r5, r5, LSL #16	; r7 = <t[0,7]|t[0,7]>
   975 ; Stage 2:
   976 	SMULWB	r6, r3, r7		; r6 = t[0,6]=OC_C4S4*t[0,7]>>16
   977 	PKHBT	r4, r4, r4, LSL #16	; r4 = <t[0,4]|t[0,4]>
   978 	SMULWT	r2, r3, r7		; r2 = t[1,6]=OC_C4S4*t[1,7]>>16
   979 	SMULWB	r5, r3, r4		; r5 = t[0,5]=OC_C4S4*t[0,4]>>16
   980 	PKHBT	r6, r6, r2, LSL #16	; r6 = <t[1,6]|t[0,6]>
   981 	SMULWT	r2, r3, r4		; r2 = t[1,5]=OC_C4S4*t[1,4]>>16
   982 	PKHBT	r2, r5, r2, LSL #16	; r2 = <t[1,5]|t[0,5]>
   983 ; Stage 3:
   984 	SSUB16	r5, r6, r2		; r5 = <t[1,6]-t[1,5]|t[0,6]-t[0,5]>
   985 	SADD16	r6, r6, r2		; r6 = <t[1,6]+t[1,5]|t[0,6]+t[0,5]>
   986 ; Stage 4:
   987 	SADD16	r2, r12,r7		; r2 = t[0]+t[7]+8
   988 	MOV	r3, r2, ASR #4
   989 	MOV	r2, r2, LSL #16
   990 	PKHTB	r3, r3, r2, ASR #20	; r3 = t[0]+t[7]+8>>4
   991 	STR	r3, [r0], #4		; y[0<<3] = t[0]+t[7]+8>>4
   992 	SADD16	r2, r12,r6		; r2 = t[0]+t[6]+8
   993 	MOV	r3, r2, ASR #4
   994 	MOV	r2, r2, LSL #16
   995 	PKHTB	r3, r3, r2, ASR #20	; r3 = t[0]+t[6]+8>>4
   996 	STR	r3, [r0, #12]		; y[1<<3] = t[0]+t[6]+8>>4
   997 	SADD16	r2, r12,r5		; r2 = t[0]+t[5]+8
   998 	MOV	r3, r2, ASR #4
   999 	MOV	r2, r2, LSL #16
  1000 	PKHTB	r3, r3, r2, ASR #20	; r3 = t[0]+t[5]+8>>4
  1001 	STR	r3, [r0, #28]		; y[2<<3] = t[0]+t[5]+8>>4
  1002 	SADD16	r2, r12,r4		; r2 = t[0]+t[4]+8
  1003 	MOV	r3, r2, ASR #4
  1004 	MOV	r2, r2, LSL #16
  1005 	PKHTB	r3, r3, r2, ASR #20	; r3 = t[0]+t[4]+8>>4
  1006 	STR	r3, [r0, #44]		; y[3<<3] = t[0]+t[4]+8>>4
  1007 	SSUB16	r4, r12,r4		; r4 = t[0]-t[4]+8
  1008 	MOV	r3, r4, ASR #4
  1009 	MOV	r4, r4, LSL #16
  1010 	PKHTB	r3, r3, r4, ASR #20	; r3 = t[0]-t[4]+8>>4
  1011 	STR	r3, [r0, #60]		; y[4<<3] = t[0]-t[4]+8>>4
  1012 	SSUB16	r5, r12,r5		; r5 = t[0]-t[5]+8
  1013 	MOV	r3, r5, ASR #4
  1014 	MOV	r5, r5, LSL #16
  1015 	PKHTB	r3, r3, r5, ASR #20	; r3 = t[0]-t[5]+8>>4
  1016 	STR	r3, [r0, #76]		; y[5<<3] = t[0]-t[5]+8>>4
  1017 	SSUB16	r6, r12,r6		; r6 = t[0]-t[6]+8
  1018 	MOV	r3, r6, ASR #4
  1019 	MOV	r6, r6, LSL #16
  1020 	PKHTB	r3, r3, r6, ASR #20	; r3 = t[0]-t[6]+8>>4
  1021 	STR	r3, [r0, #92]		; y[6<<3] = t[0]-t[6]+8>>4
  1022 	SSUB16	r7, r12,r7		; r7 = t[0]-t[7]+8
  1023 	MOV	r3, r7, ASR #4
  1024 	MOV	r7, r7, LSL #16
  1025 	PKHTB	r3, r3, r7, ASR #20	; r3 = t[0]-t[7]+8>>4
  1026 	STR	r3, [r0, #108]		; y[7<<3] = t[0]-t[7]+8>>4
  1027 	MOV	PC,r14
  1028 	ENDP
  1030 ; In theory this should save ~75 cycles over oc_idct8x8_10, more than enough to
  1031 ;  pay for increased branch mis-prediction to get here, but in practice it
  1032 ;  doesn't seem to slow anything down to take it out, and it's less code this
  1033 ;  way.
  1034  [ 0
  1035 oc_idct8x8_6_v6 PROC
  1036 	STMFD	r13!,{r4-r8,r10,r11,r14}
  1037 	SUB	r13,r13,#64*2+4
  1038 ; Row transforms
  1039 	MOV	r8, r0
  1040 	AND	r0, r13,#4	; Align the stack.
  1041 	ADD	r0, r0, r13	; Write to temp storage.
  1042 	BL	idct3_2core_v6
  1043 	BL	idct1core_v6
  1044 	; Clear input data for next block (decoder only).
  1045 	SUB	r0, r1, #3*16
  1046 	CMP	r0, r8
  1047 	AND	r1, r13,#4	; Align the stack.
  1048 	BEQ	oc_idct8x8_6_v6_cols
  1049 	MOV	r4, #0
  1050 	MOV	r5, #0
  1051 	STRD	r4, [r0]
  1052 	STR	r4, [r0,#16]
  1053 	STR	r4, [r0,#32]
  1054 	MOV	r0, r8		; Write to the final destination.
  1055 oc_idct8x8_6_v6_cols
  1056 ; Column transforms
  1057 	ADD	r1, r1, r13	; And read from temp storage.
  1058 	BL	idct3_3core_down_v6
  1059 	BL	idct3_3core_down_v6
  1060 	BL	idct3_3core_down_v6
  1061 	BL	idct3_3core_down_v6
  1062 	ADD	r13,r13,#64*2+4
  1063 	LDMFD	r13!,{r4-r8,r10,r11,PC}
  1064 	ENDP
  1066 idct1core_v6 PROC
  1067 	; r0 =       ogg_int16_t *_y (destination)
  1068 	; r1 = const ogg_int16_t *_x (source)
  1069 	LDRSH	r3, [r1], #16
  1070 	MOV	r12,#0x05
  1071 	ORR	r12,r12,#0xB500
  1072 	MUL	r3, r12, r3
  1073 	; Stall ?
  1074 	MOV	r3, r3, ASR #16
  1075 	; Don't need to actually store the odd lines; they won't be read.
  1076 	STRH	r3, [r0], #2
  1077 	STRH	r3, [r0, #30]
  1078 	STRH	r3, [r0, #62]
  1079 	STRH	r3, [r0, #94]
  1080 	MOV	PC,R14
  1081 	ENDP
  1083 idct3_2core_v6 PROC
  1084 	; r0 =       ogg_int16_t *_y (destination)
  1085 	; r1 = const ogg_int16_t *_x (source)
  1086 ; Stage 1:
  1087 	LDRD	r4, [r1], #16		; r4 = <x[0,1]|x[0,0]>; r5 = <*|x[0,2]>
  1088 	LDRD	r10,OC_C6S2_3_v6	; r10= OC_C6S2; r11= OC_C2S6
  1089 	; Stall
  1090 	SMULWB	r3, r11,r5		; r3 = t[0,3]=OC_C2S6*x[0,2]>>16
  1091 	LDR	r11,OC_C4S4
  1092 	SMULWB	r2, r10,r5		; r2 = t[0,2]=OC_C6S2*x[0,2]>>16
  1093 	LDR	r5, [r1], #16		; r5 = <x[1,1]|x[1,0]>
  1094 	SMULWB	r12,r11,r4		; r12= (t[0,0]=OC_C4S4*x[0,0]>>16)
  1095 	LDRD	r6, OC_C7S1_3_v6	; r6 = OC_C7S1; r7 = OC_C1S7
  1096 	SMULWB	r10,r11,r5		; r10= (t[1,0]=OC_C4S4*x[1,0]>>16)
  1097 	PKHBT	r12,r12,r10,LSL #16	; r12= <t[1,0]|t[0,0]>
  1098 	SMULWT  r10,r7, r5		; r10= t[1,7]=OC_C1S7*x[1,1]>>16
  1099 	PKHBT	r2, r2, r11		; r2 = <0|t[0,2]>
  1100 	SMULWT  r7, r7, r4		; r7 = t[0,7]=OC_C1S7*x[0,1]>>16
  1101 	PKHBT	r3, r3, r11		; r3 = <0|t[0,3]>
  1102 	SMULWT	r5, r6, r5		; r5 = t[1,4]=OC_C7S1*x[1,1]>>16
  1103 	PKHBT	r7, r7, r10,LSL #16	; r7 = <t[1,7]|t[0,7]>
  1104 	SMULWT	r4, r6, r4		; r4 = t[0,4]=OC_C7S1*x[0,1]>>16
  1105 ; Stage 2:
  1106 	SMULWB	r6, r11,r7		; r6 = t[0,6]=OC_C4S4*t[0,7]>>16
  1107 	PKHBT	r4, r4, r5, LSL #16	; r4 = <t[1,4]|t[0,4]>
  1108 	SMULWT	r10,r11,r7		; r10= t[1,6]=OC_C4S4*t[1,7]>>16
  1109 	SMULWB	r5, r11,r4		; r5 = t[0,5]=OC_C4S4*t[0,4]>>16
  1110 	PKHBT	r6, r6, r10,LSL #16	; r6 = <t[1,6]|t[0,6]>
  1111 	SMULWT	r10,r11,r4		; r10= t[1,5]=OC_C4S4*t[1,4]>>16
  1112 ; Stage 3:
  1113 	B	idct4_3core_stage3_v6
  1114 	ENDP
  1116 ; Another copy so the LDRD offsets are less than +/- 255.
  1117 	ALIGN 8
  1118 OC_C7S1_3_v6
  1119 	DCD	12785 ; 31F1
  1120 OC_C1S7_3_v6
  1121 	DCD	64277 ; FB15
  1122 OC_C6S2_3_v6
  1123 	DCD	25080 ; 61F8
  1124 OC_C2S6_3_v6
  1125 	DCD	60547 ; EC83
  1127 idct3_3core_down_v6 PROC
  1128 	; r0 =       ogg_int16_t *_y (destination)
  1129 	; r1 = const ogg_int16_t *_x (source)
  1130 ; Stage 1:
  1131 	LDRD	r10,[r1], #16		; r10= <x[0,1]|x[0,0]>; r11= <??|x[0,2]>
  1132 	LDRD	r6, OC_C6S2_3_v6	; r6 = OC_C6S2; r7 = OC_C2S6
  1133 	LDR	r4, [r1], #16		; r4 = <x[1,1]|x[1,0]>
  1134 	SMULWB	r3, r7, r11		; r3 = t[0,3]=OC_C2S6*x[0,2]>>16
  1135 	MOV	r7,#8
  1136 	SMULWB	r2, r6, r11		; r2 = t[0,2]=OC_C6S2*x[0,2]>>16
  1137 	LDR	r11,OC_C4S4
  1138 	SMLAWB	r12,r11,r10,r7		; r12= t[0,0]+8=(OC_C4S4*x[0,0]>>16)+8
  1139 ; Here we cheat: row 2 had just a DC, so x[0,2]==x[1,2] by definition.
  1140 	PKHBT	r3, r3, r3, LSL #16	; r3 = <t[0,3]|t[0,3]>
  1141 	SMLAWB	r5, r11,r4, r7		; r5 = t[1,0]+8=(OC_C4S4*x[1,0]>>16)+8
  1142 	PKHBT	r2, r2, r2, LSL #16	; r2 = <t[0,2]|t[0,2]>
  1143 	LDRD	r6, OC_C7S1_3_v6	; r6 = OC_C7S1; r7 = OC_C1S7
  1144 	PKHBT	r12,r12,r5, LSL #16	; r12= <t[1,0]+8|t[0,0]+8>
  1145 	SMULWT  r5, r7, r4		; r5 = t[1,7]=OC_C1S7*x[1,1]>>16
  1146 	SMULWT  r7, r7, r10		; r7 = t[0,7]=OC_C1S7*x[0,1]>>16
  1147 	SMULWT	r10,r6, r10		; r10= t[0,4]=OC_C7S1*x[0,1]>>16
  1148 	PKHBT	r7, r7, r5, LSL #16	; r7 = <t[1,7]|t[0,7]>
  1149 	SMULWT	r4, r6, r4		; r4 = t[1,4]=OC_C7S1*x[1,1]>>16
  1150 ; Stage 2:
  1151 	SMULWB	r6, r11,r7		; r6 = t[0,6]=OC_C4S4*t[0,7]>>16
  1152 	PKHBT	r4, r10,r4, LSL #16	; r4 = <t[1,4]|t[0,4]>
  1153 	SMULWT	r10,r11,r7		; r10= t[1,6]=OC_C4S4*t[1,7]>>16
  1154 	SMULWB	r5, r11,r4		; r5 = t[0,5]=OC_C4S4*t[0,4]>>16
  1155 	PKHBT	r6, r6, r10,LSL #16	; r6 = <t[1,6]|t[0,6]>
  1156 	SMULWT	r10,r11,r4		; r10= t[1,5]=OC_C4S4*t[1,4]>>16
  1157 ; Stage 3:
  1158 	B	idct4_4core_down_stage3_v6
  1159 	ENDP
 ]
  1162 idct4_3core_v6 PROC
  1163 	; r0 =       ogg_int16_t *_y (destination)
  1164 	; r1 = const ogg_int16_t *_x (source)
  1165 ; Stage 1:
  1166 	LDRD	r10,[r1], #16	; r10= <x[0,1]|x[0,0]>; r11= <x[0,3]|x[0,2]>
  1167 	LDRD	r2, OC_C5S3_4_v6	; r2 = OC_C5S3; r3 = OC_C3S5
  1168 	LDRD	r4, [r1], #16		; r4 = <x[1,1]|x[1,0]>; r5 = <??|x[1,2]>
  1169 	SMULWT	r9, r3, r11		; r9 = t[0,6]=OC_C3S5*x[0,3]>>16
  1170 	SMULWT	r8, r2, r11		; r8 = -t[0,5]=OC_C5S3*x[0,3]>>16
  1171 	PKHBT	r9, r9, r2		; r9 = <0|t[0,6]>
  1172 	LDRD	r6, OC_C6S2_4_v6	; r6 = OC_C6S2; r7 = OC_C2S6
  1173 	PKHBT	r8, r8, r2		; r8 = <0|-t[0,5]>
  1174 	SMULWB	r3, r7, r11		; r3 = t[0,3]=OC_C2S6*x[0,2]>>16
  1175 	SMULWB	r2, r6, r11		; r2 = t[0,2]=OC_C6S2*x[0,2]>>16
  1176 	LDR	r11,OC_C4S4
  1177 	SMULWB	r12,r7, r5		; r12= t[1,3]=OC_C2S6*x[1,2]>>16
  1178 	SMULWB	r5, r6, r5		; r5 = t[1,2]=OC_C6S2*x[1,2]>>16
  1179 	PKHBT	r3, r3, r12,LSL #16	; r3 = <t[1,3]|t[0,3]>
  1180 	SMULWB	r12,r11,r10		; r12= t[0,0]=OC_C4S4*x[0,0]>>16
  1181 	PKHBT	r2, r2, r5, LSL #16	; r2 = <t[1,2]|t[0,2]>
  1182 	SMULWB	r5, r11,r4		; r5 = t[1,0]=OC_C4S4*x[1,0]>>16
  1183 	LDRD	r6, OC_C7S1_4_v6	; r6 = OC_C7S1; r7 = OC_C1S7
  1184 	PKHBT	r12,r12,r5, LSL #16	; r12= <t[1,0]|t[0,0]>
  1185 	SMULWT  r5, r7, r4		; r5 = t[1,7]=OC_C1S7*x[1,1]>>16
  1186 	SMULWT  r7, r7, r10		; r7 = t[0,7]=OC_C1S7*x[0,1]>>16
  1187 	SMULWT	r10,r6, r10		; r10= t[0,4]=OC_C7S1*x[0,1]>>16
  1188 	PKHBT	r7, r7, r5, LSL #16	; r7 = <t[1,7]|t[0,7]>
  1189 	SMULWT	r4, r6, r4		; r4 = t[1,4]=OC_C7S1*x[1,1]>>16
  1190 ; Stage 2:
  1191 	SSUB16	r6, r7, r9		; r6 = t[7]-t[6]
  1192 	PKHBT	r4, r10,r4, LSL #16	; r4 = <t[1,4]|t[0,4]>
  1193 	SADD16	r7, r7, r9		; r7 = t[7]=t[7]+t[6]
  1194 	SMULWT	r9, r11,r6		; r9 = t[1,6]=OC_C4S4*r6T>>16
  1195 	SADD16	r5, r4, r8		; r5 = t[4]-t[5]
  1196 	SMULWB	r6, r11,r6		; r6 = t[0,6]=OC_C4S4*r6B>>16
  1197 	SSUB16	r4, r4, r8		; r4 = t[4]=t[4]+t[5]
  1198 	SMULWT	r10,r11,r5		; r10= t[1,5]=OC_C4S4*r5T>>16
  1199 	PKHBT	r6, r6, r9, LSL #16	; r6 = <t[1,6]|t[0,6]>
  1200 	SMULWB	r5, r11,r5		; r5 = t[0,5]=OC_C4S4*r5B>>16
  1201 ; Stage 3:
  1202 idct4_3core_stage3_v6
  1203 	SADD16	r11,r12,r2		; r11= t[1]=t[0]+t[2]
  1204 	PKHBT	r10,r5, r10,LSL #16	; r10= <t[1,5]|t[0,5]>
  1205 	SSUB16	r2, r12,r2		; r2 = t[2]=t[0]-t[2]
  1206 idct4_3core_stage3_5_v6
  1207 	SSUB16	r5, r6, r10		; r5 = t[5]'=t[6]-t[5]
  1208 	SADD16	r6, r6, r10		; r6 = t[6]=t[6]+t[5]
  1209 	SADD16	r10,r12,r3		; r10= t[0]'=t[0]+t[3]
  1210 	SSUB16	r3, r12,r3		; r3 = t[3]=t[0]-t[3]
  1211 ; Stage 4:
  1212 	SADD16	r12,r10,r7		; r12= t[0]+t[7]
  1213 	STR	r12,[r0], #4		; y[0<<3] = t[0]+t[7]
  1214 	SADD16	r12,r11,r6		; r12= t[1]+t[6]
  1215 	STR	r12,[r0, #12]		; y[1<<3] = t[1]+t[6]
  1216 	SADD16	r12,r2, r5		; r12= t[2]+t[5]
  1217 	STR	r12,[r0, #28]		; y[2<<3] = t[2]+t[5]
  1218 	SADD16	r12,r3, r4		; r12= t[3]+t[4]
  1219 	STR	r12,[r0, #44]		; y[3<<3] = t[3]+t[4]
  1220 	SSUB16	r4, r3, r4		; r4 = t[3]-t[4]
  1221 	STR	r4, [r0, #60]		; y[4<<3] = t[3]-t[4]
  1222 	SSUB16	r5, r2, r5		; r5 = t[2]-t[5]
  1223 	STR	r5, [r0, #76]		; y[5<<3] = t[2]-t[5]
  1224 	SSUB16	r6, r11,r6		; r6 = t[1]-t[6]
  1225 	STR	r6, [r0, #92]		; y[6<<3] = t[1]-t[6]
  1226 	SSUB16	r7, r10,r7		; r7 = t[0]-t[7]
  1227 	STR	r7, [r0, #108]		; y[7<<3] = t[0]-t[7]
  1228 	MOV	PC,r14
  1229 	ENDP
  1231 ; Another copy so the LDRD offsets are less than +/- 255.
  1232 	ALIGN 8
  1233 OC_C7S1_4_v6
  1234 	DCD	12785 ; 31F1
  1235 OC_C1S7_4_v6
  1236 	DCD	64277 ; FB15
  1237 OC_C6S2_4_v6
  1238 	DCD	25080 ; 61F8
  1239 OC_C2S6_4_v6
  1240 	DCD	60547 ; EC83
  1241 OC_C5S3_4_v6
  1242 	DCD	36410 ; 8E3A
  1243 OC_C3S5_4_v6
  1244 	DCD	54491 ; D4DB
  1246 idct4_4core_down_v6 PROC
  1247 	; r0 =       ogg_int16_t *_y (destination)
  1248 	; r1 = const ogg_int16_t *_x (source)
  1249 ; Stage 1:
  1250 	LDRD	r10,[r1], #16	; r10= <x[0,1]|x[0,0]>; r11= <x[0,3]|x[0,2]>
  1251 	LDRD	r2, OC_C5S3_4_v6	; r2 = OC_C5S3; r3 = OC_C3S5
  1252 	LDRD	r4, [r1], #16	; r4 = <x[1,1]|x[1,0]>; r5 = <x[1,3]|x[1,2]>
  1253 	SMULWT	r9, r3, r11		; r9 = t[0,6]=OC_C3S5*x[0,3]>>16
  1254 	LDRD	r6, OC_C6S2_4_v6	; r6 = OC_C6S2; r7 = OC_C2S6
  1255 	SMULWT	r8, r2, r11		; r8 = -t[0,5]=OC_C5S3*x[0,3]>>16
  1256 ; Here we cheat: row 3 had just a DC, so x[0,3]==x[1,3] by definition.
  1257 	PKHBT	r9, r9, r9, LSL #16	; r9 = <t[0,6]|t[0,6]>
  1258 	SMULWB	r3, r7, r11		; r3 = t[0,3]=OC_C2S6*x[0,2]>>16
  1259 	PKHBT	r8, r8, r8, LSL #16	; r8 = <-t[0,5]|-t[0,5]>
  1260 	SMULWB	r2, r6, r11		; r2 = t[0,2]=OC_C6S2*x[0,2]>>16
  1261 	LDR	r11,OC_C4S4
  1262 	SMULWB	r12,r7, r5		; r12= t[1,3]=OC_C2S6*x[1,2]>>16
  1263 	MOV	r7,#8
  1264 	SMULWB	r5, r6, r5		; r5 = t[1,2]=OC_C6S2*x[1,2]>>16
  1265 	PKHBT	r3, r3, r12,LSL #16	; r3 = <t[1,3]|t[0,3]>
  1266 	SMLAWB	r12,r11,r10,r7		; r12= t[0,0]+8=(OC_C4S4*x[0,0]>>16)+8
  1267 	PKHBT	r2, r2, r5, LSL #16	; r2 = <t[1,2]|t[0,2]>
  1268 	SMLAWB	r5, r11,r4 ,r7		; r5 = t[1,0]+8=(OC_C4S4*x[1,0]>>16)+8
  1269 	LDRD	r6, OC_C7S1_4_v6	; r6 = OC_C7S1; r7 = OC_C1S7
  1270 	PKHBT	r12,r12,r5, LSL #16	; r12= <t[1,0]+8|t[0,0]+8>
  1271 	SMULWT  r5, r7, r4		; r5 = t[1,7]=OC_C1S7*x[1,1]>>16
  1272 	SMULWT  r7, r7, r10		; r7 = t[0,7]=OC_C1S7*x[0,1]>>16
  1273 	SMULWT	r10,r6, r10		; r10= t[0,4]=OC_C7S1*x[0,1]>>16
  1274 	PKHBT	r7, r7, r5, LSL #16	; r7 = <t[1,7]|t[0,7]>
  1275 	SMULWT	r4, r6, r4		; r4 = t[1,4]=OC_C7S1*x[1,1]>>16
  1276 ; Stage 2:
  1277 	SSUB16	r6, r7, r9		; r6 = t[7]-t[6]
  1278 	PKHBT	r4, r10,r4, LSL #16	; r4 = <t[1,4]|t[0,4]>
  1279 	SADD16	r7, r7, r9		; r7 = t[7]=t[7]+t[6]
  1280 	SMULWT	r9, r11,r6		; r9 = t[1,6]=OC_C4S4*r6T>>16
  1281 	SADD16	r5, r4, r8		; r5 = t[4]-t[5]
  1282 	SMULWB	r6, r11,r6		; r6 = t[0,6]=OC_C4S4*r6B>>16
  1283 	SSUB16	r4, r4, r8		; r4 = t[4]=t[4]+t[5]
  1284 	SMULWT	r10,r11,r5		; r10= t[1,5]=OC_C4S4*r5T>>16
  1285 	PKHBT	r6, r6, r9, LSL #16	; r6 = <t[1,6]|t[0,6]>
  1286 	SMULWB	r5, r11,r5		; r5 = t[0,5]=OC_C4S4*r5B>>16
  1287 ; Stage 3:
  1288 idct4_4core_down_stage3_v6
  1289 	SADD16	r11,r12,r2		; r11= t[1]+8=t[0]+t[2]+8
  1290 	PKHBT	r10,r5, r10,LSL #16	; r10= <t[1,5]|t[0,5]>
  1291 	SSUB16	r2, r12,r2		; r2 = t[2]+8=t[0]-t[2]+8
  1292 	B	idct8_8core_down_stage3_5_v6
  1293 	ENDP
  1295 idct8_8core_v6 PROC
  1296 	STMFD	r13!,{r0,r14}
  1297 ; Stage 1:
  1298 	;5-6 rotation by 3pi/16
  1299 	LDRD	r10,OC_C5S3_4_v6	; r10= OC_C5S3, r11= OC_C3S5
  1300 	LDR	r4, [r1,#8]		; r4 = <x[0,5]|x[0,4]>
  1301 	LDR	r7, [r1,#24]		; r7 = <x[1,5]|x[1,4]>
  1302 	SMULWT	r5, r11,r4		; r5 = OC_C3S5*x[0,5]>>16
  1303 	LDR	r0, [r1,#4]		; r0 = <x[0,3]|x[0,2]>
  1304 	SMULWT	r3, r11,r7		; r3 = OC_C3S5*x[1,5]>>16
  1305 	LDR	r12,[r1,#20]		; r12= <x[1,3]|x[1,2]>
  1306 	SMULWT	r6, r11,r0		; r6 = OC_C3S5*x[0,3]>>16
  1307 	SMULWT	r11,r11,r12		; r11= OC_C3S5*x[1,3]>>16
  1308 	SMLAWT	r6, r10,r4, r6		; r6 = t[0,6]=r6+(OC_C5S3*x[0,5]>>16)
  1309 	PKHBT	r5, r5, r3, LSL #16	; r5 = <r3|r5>
  1310 	SMLAWT	r11,r10,r7, r11		; r11= t[1,6]=r11+(OC_C5S3*x[1,5]>>16)
  1311 	PKHBT	r4, r4, r7, LSL #16	; r4 = <x[1,4]|x[0,4]>
  1312 	SMULWT	r3, r10,r0		; r3 = OC_C5S3*x[0,3]>>16
  1313 	PKHBT	r6, r6, r11,LSL #16	; r6 = <t[1,6]|t[0,6]>
  1314 	SMULWT	r8, r10,r12		; r8 = OC_C5S3*x[1,3]>>16
  1315 	;2-3 rotation by 6pi/16
  1316 	LDRD	r10,OC_C6S2_4_v6	; r10= OC_C6S2, r11= OC_C2S6
  1317 	PKHBT	r3, r3, r8, LSL #16	; r3 = <r8|r3>
  1318 	LDR	r8, [r1,#12]		; r8 = <x[0,7]|x[0,6]>
  1319 	SMULWB	r2, r10,r0		; r2 = OC_C6S2*x[0,2]>>16
  1320 	SSUB16	r5, r5, r3		; r5 = <t[1,5]|t[0,5]>
  1321 	SMULWB	r9, r10,r12		; r9 = OC_C6S2*x[1,2]>>16
  1322 	LDR	r7, [r1,#28]		; r7 = <x[1,7]|x[1,6]>
  1323 	SMULWB	r3, r10,r8		; r3 = OC_C6S2*x[0,6]>>16
  1324 	SMULWB	r10,r10,r7		; r10= OC_C6S2*x[1,6]>>16
  1325 	PKHBT	r2, r2, r9, LSL #16	; r2 = <r2|r9>
  1326 	SMLAWB	r3, r11,r0, r3		; r3 = t[0,3]=r3+(OC_C2S6*x[0,2]>>16)
  1327 	SMLAWB	r10,r11,r12,r10		; r10= t[1,3]=r10+(OC_C2S6*x[1,2]>>16)
  1328 	SMULWB	r9, r11,r8		; r9 = OC_C2S6*x[0,6]>>16
  1329 	PKHBT	r3, r3, r10,LSL #16	; r3 = <t[1,3]|t[0,3]>
  1330 	SMULWB	r12,r11,r7		; r12= OC_C2S6*x[1,6]>>16
  1331 	;4-7 rotation by 7pi/16
  1332 	LDRD	r10,OC_C7S1_8_v6	; r10= OC_C7S1, r11= OC_C1S7
  1333 	PKHBT	r9, r9, r12,LSL #16	; r9 = <r9|r12>
  1334 	LDR	r0, [r1],#16		; r0 = <x[0,1]|x[0,0]>
  1335 	PKHTB	r7, r7, r8, ASR #16	; r7 = <x[1,7]|x[0,7]>
  1336 	SSUB16	r2, r2, r9		; r2 = <t[1,2]|t[0,2]>
  1337 	SMULWB	r9, r10,r7		; r9 = OC_C7S1*x[0,7]>>16
  1338 	LDR	r14,[r1],#16		; r14= <x[1,1]|x[1,0]>
  1339 	SMULWT	r12,r10,r7		; r12= OC_C7S1*x[1,7]>>16
  1340 	SMULWT	r8, r10,r0		; r8 = OC_C7S1*x[0,1]>>16
  1341 	SMULWT	r10,r10,r14		; r10= OC_C7S1*x[1,1]>>16
  1342 	SMLAWT	r9, r11,r0, r9		; r9 = t[0,7]=r9+(OC_C1S7*x[0,1]>>16)
  1343 	PKHBT	r8, r8, r10,LSL #16	; r8 = <r12|r8>
  1344 	SMLAWT	r12,r11,r14,r12		; r12= t[1,7]=r12+(OC_C1S7*x[1,1]>>16)
  1345 	PKHBT	r0, r0, r14,LSL #16	; r0 = <x[1,0]|x[0,0]>
  1346 	SMULWB	r10,r11,r7		; r10= OC_C1S7*x[0,7]>>16
  1347 	PKHBT	r9, r9, r12,LSL #16	; r9 = <t[1,7]|t[0,7]>
  1348 	SMULWT	r12,r11,r7		; r12= OC_C1S7*x[1,7]>>16
  1349 	;0-1 butterfly
  1350 	LDR	r11,OC_C4S4
  1351 	PKHBT	r10,r10,r12,LSL #16	; r10= <r12|r10>
  1352 	SADD16	r7, r0, r4		; r7 = x[0]+x[4]
  1353 	SSUB16	r10,r8, r10		; r10= <t[1,4]|t[0,4]>
  1354 	SSUB16	r4, r0, r4		; r4 = x[0]-x[4]
  1355 	SMULWB	r8, r11,r7		; r8 = t[0,0]=OC_C4S4*r7B>>16
  1356 	SMULWT	r12,r11,r7		; r12= t[1,0]=OC_C4S4*r7T>>16
  1357 	SMULWB	r7, r11,r4		; r7 = t[0,1]=OC_C4S4*r4B>>16
  1358 	PKHBT	r12,r8, r12,LSL #16	; r12= <t[1,0]|t[0,0]>
  1359 	SMULWT	r8, r11,r4		; r8 = t[1,1]=OC_C4S4*r4T>>16
  1360 ; Stage 2:
  1361 	SADD16	r4, r10,r5		; r4 = t[4]'=t[4]+t[5]
  1362 	PKHBT	r8, r7, r8, LSL #16	; r8 = <t[1,1]|t[0,1]>
  1363 	SSUB16	r5, r10,r5		; r5 = t[4]-t[5]
  1364 	SMULWB	r10,r11,r5		; r10= t[0,5]=OC_C4S4*r5B>>16
  1365 	SADD16	r7, r9, r6		; r7 = t[7]'=t[7]+t[6]
  1366 	SMULWT	r5, r11,r5		; r5 = t[1,5]=OC_C4S4*r5T>>16
  1367 	SSUB16	r6, r9, r6		; r6 = t[7]-t[6]
  1368 	SMULWB	r9, r11,r6		; r9 = t[0,6]=OC_C4S4*r6B>>16
  1369 	PKHBT	r10,r10,r5, LSL #16	; r10= <t[1,5]|t[0,5]>
  1370 	SMULWT	r6, r11,r6		; r6 = t[1,6]=OC_C4S4*r6T>>16
  1371 ; Stage 3:
  1372 	SADD16	r11,r8, r2		; r11= t[1]'=t[1]+t[2]
  1373 	PKHBT	r6, r9, r6, LSL #16	; r6 = <t[1,6]|t[0,6]>
  1374 	SSUB16	r2, r8, r2		; r2 = t[2]=t[1]-t[2]
  1375 	LDMFD	r13!,{r0,r14}
  1376 	B	idct4_3core_stage3_5_v6
  1377 	ENDP
  1379 ; Another copy so the LDRD offsets are less than +/- 255.
  1380 	ALIGN 8
  1381 OC_C7S1_8_v6
  1382 	DCD	12785 ; 31F1
  1383 OC_C1S7_8_v6
  1384 	DCD	64277 ; FB15
  1385 OC_C6S2_8_v6
  1386 	DCD	25080 ; 61F8
  1387 OC_C2S6_8_v6
  1388 	DCD	60547 ; EC83
  1389 OC_C5S3_8_v6
  1390 	DCD	36410 ; 8E3A
  1391 OC_C3S5_8_v6
  1392 	DCD	54491 ; D4DB
  1394 idct8_8core_down_v6 PROC
  1395 	STMFD	r13!,{r0,r14}
  1396 ; Stage 1:
  1397 	;5-6 rotation by 3pi/16
  1398 	LDRD	r10,OC_C5S3_8_v6	; r10= OC_C5S3, r11= OC_C3S5
  1399 	LDR	r4, [r1,#8]		; r4 = <x[0,5]|x[0,4]>
  1400 	LDR	r7, [r1,#24]		; r7 = <x[1,5]|x[1,4]>
  1401 	SMULWT	r5, r11,r4		; r5 = OC_C3S5*x[0,5]>>16
  1402 	LDR	r0, [r1,#4]		; r0 = <x[0,3]|x[0,2]>
  1403 	SMULWT	r3, r11,r7		; r3 = OC_C3S5*x[1,5]>>16
  1404 	LDR	r12,[r1,#20]		; r12= <x[1,3]|x[1,2]>
  1405 	SMULWT	r6, r11,r0		; r6 = OC_C3S5*x[0,3]>>16
  1406 	SMULWT	r11,r11,r12		; r11= OC_C3S5*x[1,3]>>16
  1407 	SMLAWT	r6, r10,r4, r6		; r6 = t[0,6]=r6+(OC_C5S3*x[0,5]>>16)
  1408 	PKHBT	r5, r5, r3, LSL #16	; r5 = <r3|r5>
  1409 	SMLAWT	r11,r10,r7, r11		; r11= t[1,6]=r11+(OC_C5S3*x[1,5]>>16)
  1410 	PKHBT	r4, r4, r7, LSL #16	; r4 = <x[1,4]|x[0,4]>
  1411 	SMULWT	r3, r10,r0		; r3 = OC_C5S3*x[0,3]>>16
  1412 	PKHBT	r6, r6, r11,LSL #16	; r6 = <t[1,6]|t[0,6]>
  1413 	SMULWT	r8, r10,r12		; r8 = OC_C5S3*x[1,3]>>16
  1414 	;2-3 rotation by 6pi/16
  1415 	LDRD	r10,OC_C6S2_8_v6	; r10= OC_C6S2, r11= OC_C2S6
  1416 	PKHBT	r3, r3, r8, LSL #16	; r3 = <r8|r3>
  1417 	LDR	r8, [r1,#12]		; r8 = <x[0,7]|x[0,6]>
  1418 	SMULWB	r2, r10,r0		; r2 = OC_C6S2*x[0,2]>>16
  1419 	SSUB16	r5, r5, r3		; r5 = <t[1,5]|t[0,5]>
  1420 	SMULWB	r9, r10,r12		; r9 = OC_C6S2*x[1,2]>>16
  1421 	LDR	r7, [r1,#28]		; r7 = <x[1,7]|x[1,6]>
  1422 	SMULWB	r3, r10,r8		; r3 = OC_C6S2*x[0,6]>>16
  1423 	SMULWB	r10,r10,r7		; r10= OC_C6S2*x[1,6]>>16
  1424 	PKHBT	r2, r2, r9, LSL #16	; r2 = <r9|r2>
  1425 	SMLAWB	r3, r11,r0, r3		; r3 = t[0,3]=r3+(OC_C2S6*x[0,2]>>16)
  1426 	SMLAWB	r10,r11,r12,r10		; r10= t[1,3]=r10+(OC_C2S6*x[1,2]>>16)
  1427 	SMULWB	r9, r11,r8		; r9 = OC_C2S6*x[0,6]>>16
  1428 	PKHBT	r3, r3, r10,LSL #16	; r3 = <t[1,3]|t[0,3]>
  1429 	SMULWB	r12,r11,r7		; r12= OC_C2S6*x[1,6]>>16
  1430 	;4-7 rotation by 7pi/16
  1431 	LDRD	r10,OC_C7S1_8_v6	; r10= OC_C7S1, r11= OC_C1S7
  1432 	PKHBT	r9, r9, r12,LSL #16	; r9 = <r12|r9>
  1433 	LDR	r0, [r1],#16		; r0 = <x[0,1]|x[0,0]>
  1434 	PKHTB	r7, r7, r8, ASR #16	; r7 = <x[1,7]|x[0,7]>
  1435 	SSUB16	r2, r2, r9		; r2 = <t[1,2]|t[0,2]>
  1436 	SMULWB	r9, r10,r7		; r9 = OC_C7S1*x[0,7]>>16
  1437 	LDR	r14,[r1],#16		; r14= <x[1,1]|x[1,0]>
  1438 	SMULWT	r12,r10,r7		; r12= OC_C7S1*x[1,7]>>16
  1439 	SMULWT	r8, r10,r0		; r8 = OC_C7S1*x[0,1]>>16
  1440 	SMULWT	r10,r10,r14		; r10= OC_C7S1*x[1,1]>>16
  1441 	SMLAWT	r9, r11,r0, r9		; r9 = t[0,7]=r9+(OC_C1S7*x[0,1]>>16)
  1442 	PKHBT	r8, r8, r10,LSL #16	; r8 = <r10|r8>
  1443 	SMLAWT	r12,r11,r14,r12		; r12= t[1,7]=r12+(OC_C1S7*x[1,1]>>16)
  1444 	PKHBT	r0, r0, r14,LSL #16	; r0 = <x[1,0]|x[0,0]>
  1445 	SMULWB	r10,r11,r7		; r10= OC_C1S7*x[0,7]>>16
  1446 	PKHBT	r9, r9, r12,LSL #16	; r9 = <t[1,7]|t[0,7]>
  1447 	SMULWT	r12,r11,r7		; r12= OC_C1S7*x[1,7]>>16
  1448 	;0-1 butterfly
  1449 	LDR	r11,OC_C4S4
  1450 	MOV	r14,#8
  1451 	PKHBT	r10,r10,r12,LSL #16	; r10= <r12|r10>
  1452 	SADD16	r7, r0, r4		; r7 = x[0]+x[4]
  1453 	SSUB16	r10,r8, r10		; r10= <t[1,4]|t[0,4]>
  1454 	SMLAWB	r8, r11,r7, r14		; r8 = t[0,0]+8=(OC_C4S4*r7B>>16)+8
  1455 	SSUB16	r4, r0, r4		; r4 = x[0]-x[4]
  1456 	SMLAWT	r12,r11,r7, r14		; r12= t[1,0]+8=(OC_C4S4*r7T>>16)+8
  1457 	SMLAWB	r7, r11,r4, r14		; r7 = t[0,1]+8=(OC_C4S4*r4B>>16)+8
  1458 	PKHBT	r12,r8, r12,LSL #16	; r12= <t[1,0]+8|t[0,0]+8>
  1459 	SMLAWT	r8, r11,r4, r14		; r8 = t[1,1]+8=(OC_C4S4*r4T>>16)+8
  1460 ; Stage 2:
  1461 	SADD16	r4, r10,r5		; r4 = t[4]'=t[4]+t[5]
  1462 	PKHBT	r8, r7, r8, LSL #16	; r8 = <t[1,1]+8|t[0,1]+8>
  1463 	SSUB16	r5, r10,r5		; r5 = t[4]-t[5]
  1464 	SMULWB	r10,r11,r5		; r10= t[0,5]=OC_C4S4*r5B>>16
  1465 	SADD16	r7, r9, r6		; r7 = t[7]'=t[7]+t[6]
  1466 	SMULWT	r5, r11,r5		; r5 = t[1,5]=OC_C4S4*r5T>>16
  1467 	SSUB16	r6, r9, r6		; r6 = t[7]-t[6]
  1468 	SMULWB	r9, r11,r6		; r9 = t[0,6]=OC_C4S4*r6B>>16
  1469 	PKHBT	r10,r10,r5, LSL #16	; r10= <t[1,5]|t[0,5]>
  1470 	SMULWT	r6, r11,r6		; r6 = t[1,6]=OC_C4S4*r6T>>16
  1471 ; Stage 3:
  1472 	SADD16	r11,r8, r2		; r11= t[1]'+8=t[1]+t[2]+8
  1473 	PKHBT	r6, r9, r6, LSL #16	; r6 = <t[1,6]|t[0,6]>
  1474 	SSUB16	r2, r8, r2		; r2 = t[2]+8=t[1]-t[2]+8
  1475 	LDMFD	r13!,{r0,r14}
  1476 idct8_8core_down_stage3_5_v6
  1477 	SSUB16	r5, r6, r10		; r5 = t[5]'=t[6]-t[5]
  1478 	SADD16	r6, r6, r10		; r6 = t[6]=t[6]+t[5]
  1479 	SADD16	r10,r12,r3		; r10= t[0]'+8=t[0]+t[3]+8
  1480 	SSUB16	r3, r12,r3		; r3 = t[3]+8=t[0]-t[3]+8
  1481 ; Stage 4:
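	; Each register below holds two packed 16-bit results; the ASR #4 /
	;  LSL #16 / PKHTB ... ASR #20 sequence shifts both halves right by four
	;  (with sign) without unpacking them.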
  1482 	SADD16	r12,r10,r7		; r12= t[0]+t[7]+8
  1483 	SSUB16	r7, r10,r7		; r7 = t[0]-t[7]+8
  1484 	MOV	r10,r12,ASR #4
  1485 	MOV	r12,r12,LSL #16
  1486 	PKHTB	r10,r10,r12,ASR #20	; r10= t[0]+t[7]+8>>4
  1487 	STR	r10,[r0], #4		; y[0<<3] = t[0]+t[7]+8>>4
  1488 	SADD16	r12,r11,r6		; r12= t[1]+t[6]+8
  1489 	SSUB16	r6, r11,r6		; r6 = t[1]-t[6]+8
  1490 	MOV	r10,r12,ASR #4
  1491 	MOV	r12,r12,LSL #16
  1492 	PKHTB	r10,r10,r12,ASR #20	; r10= t[1]+t[6]+8>>4
  1493 	STR	r10,[r0, #12]		; y[1<<3] = t[1]+t[6]+8>>4
  1494 	SADD16	r12,r2, r5		; r12= t[2]+t[5]+8
  1495 	SSUB16	r5, r2, r5		; r5 = t[2]-t[5]+8
  1496 	MOV	r10,r12,ASR #4
  1497 	MOV	r12,r12,LSL #16
  1498 	PKHTB	r10,r10,r12,ASR #20	; r10= t[2]+t[5]+8>>4
  1499 	STR	r10,[r0, #28]		; y[2<<3] = t[2]+t[5]+8>>4
  1500 	SADD16	r12,r3, r4		; r12= t[3]+t[4]+8
  1501 	SSUB16	r4, r3, r4		; r4 = t[3]-t[4]+8
  1502 	MOV	r10,r12,ASR #4
  1503 	MOV	r12,r12,LSL #16
  1504 	PKHTB	r10,r10,r12,ASR #20	; r10= t[3]+t[4]+8>>4
  1505 	STR	r10,[r0, #44]		; y[3<<3] = t[3]+t[4]+8>>4
  1506 	MOV	r10,r4, ASR #4
  1507 	MOV	r4, r4, LSL #16
  1508 	PKHTB	r10,r10,r4, ASR #20	; r10= t[3]-t[4]+8>>4
  1509 	STR	r10,[r0, #60]		; y[4<<3] = t[3]-t[4]+8>>4
  1510 	MOV	r10,r5, ASR #4
  1511 	MOV	r5, r5, LSL #16
  1512 	PKHTB	r10,r10,r5, ASR #20	; r10= t[2]-t[5]+8>>4
  1513 	STR	r10,[r0, #76]		; y[5<<3] = t[2]-t[5]+8>>4
  1514 	MOV	r10,r6, ASR #4
  1515 	MOV	r6, r6, LSL #16
  1516 	PKHTB	r10,r10,r6, ASR #20	; r10= t[1]-t[6]+8>>4
  1517 	STR	r10,[r0, #92]		; y[6<<3] = t[1]-t[6]+8>>4
  1518 	MOV	r10,r7, ASR #4
  1519 	MOV	r7, r7, LSL #16
  1520 	PKHTB	r10,r10,r7, ASR #20	; r10= t[0]-t[7]+8>>4
  1521 	STR	r10,[r0, #108]		; y[7<<3] = t[0]-t[7]+8>>4
  1522 	MOV	PC,r14
  1523 	ENDP
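; NEON versions of the routines above, assembled only when OC_ARM_ASM_NEON
;  is defined.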
  1526  [ OC_ARM_ASM_NEON
  1527 	EXPORT	oc_idct8x8_1_neon
  1528 	EXPORT	oc_idct8x8_neon
  1530 	ALIGN 16
  1531 OC_IDCT_CONSTS_NEON
  1532 	DCW	    8
  1533 	DCW	64277 ; FB15 (C1S7)
  1534 	DCW	60547 ; EC83 (C2S6)
  1535 	DCW	54491 ; D4DB (C3S5)
  1536 	DCW	46341 ; B505 (C4S4)
  1537 	DCW	36410 ; 8E3A (C5S3)
  1538 	DCW	25080 ; 61F8 (C6S2)
  1539 	DCW	12785 ; 31F1 (C7S1)
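; These eight halfwords are loaded into D0/D1 so that each constant can be
;  used as a scalar operand to VMULL.S16: D0[0]=8, D0[1]=C1S7, D0[2]=C2S6,
;  D0[3]=C3S5, D1[0]=C4S4, D1[1]=C5S3, D1[2]=C6S2, D1[3]=C7S1.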
  1541 oc_idct8x8_1_neon PROC
  1542 	; r0 = ogg_int16_t  *_y
  1543 	; r1 = ogg_uint16_t  _dc
  1544 	VDUP.S16	Q0, r1
  1545 	VMOV		Q1, Q0
  1546 	VST1.64		{D0, D1, D2, D3}, [r0@128]!
  1547 	VST1.64		{D0, D1, D2, D3}, [r0@128]!
  1548 	VST1.64		{D0, D1, D2, D3}, [r0@128]!
  1549 	VST1.64		{D0, D1, D2, D3}, [r0@128]
  1550 	MOV	PC, r14
  1551 	ENDP
  1553 oc_idct8x8_neon PROC
  1554 	; r0 = ogg_int16_t *_y
  1555 	; r1 = ogg_int16_t *_x
  1556 	; r2 = int          _last_zzi
  1557 	CMP	r2, #10
  1558 	BLE	oc_idct8x8_10_neon
  1559 oc_idct8x8_slow_neon
  1560 	VPUSH		{D8-D15}
  1561 	MOV	r2, r1
  1562 	ADR	r3, OC_IDCT_CONSTS_NEON
  1563 	; Row transforms (input is pre-transposed)
  1564 	VLD1.64		{D16,D17,D18,D19}, [r2@128]!
  1565 	VLD1.64		{D20,D21,D22,D23}, [r2@128]!
  1566 	VLD1.64		{D24,D25,D26,D27}, [r2@128]!
  1567 	VSUB.S16	Q1, Q8, Q12	; Q1 = x[0]-x[4]
  1568 	VLD1.64		{D28,D29,D30,D31}, [r2@128]
  1569 	VADD.S16	Q8, Q8, Q12	; Q8 = x[0]+x[4]
  1570 	VLD1.64		{D0,D1},           [r3@128]
  1571 	MOV	r12, r14
  1572 	BL	oc_idct8x8_stage123_neon
  1573 ; Stage 4
  1574 	VSUB.S16	Q15,Q8, Q7	; Q15 = y[7]=t[0]'-t[7]'
  1575 	VADD.S16	Q8, Q8, Q7	; Q8  = y[0]=t[0]'+t[7]'
  1576 	VSUB.S16	Q14,Q9, Q3	; Q14 = y[6]=t[1]'-t[6]''
  1577 	VADD.S16	Q9, Q9, Q3	; Q9  = y[1]=t[1]'+t[6]''
  1578 	VSUB.S16	Q13,Q10,Q5	; Q13 = y[5]=t[2]'-t[5]''
  1579 	VADD.S16	Q10,Q10,Q5	; Q10 = y[2]=t[2]'+t[5]''
  1580 	VTRN.16		Q14,Q15
  1581 	VSUB.S16	Q12,Q11,Q4	; Q12 = y[4]=t[3]'-t[4]'
  1582 	VADD.S16	Q11,Q11,Q4	; Q11 = y[3]=t[3]'+t[4]'
  1583 	; 8x8 Transpose
  1584 	VTRN.16		Q8, Q9
  1585 	VTRN.16		Q10,Q11
  1586 	VTRN.16		Q12,Q13
  1587 	VTRN.32		Q8, Q10
  1588 	VTRN.32		Q9, Q11
  1589 	VTRN.32		Q12,Q14
  1590 	VTRN.32		Q13,Q15
  1591 	VSWP		D17,D24
  1592 	VSUB.S16	Q1, Q8, Q12	; Q1 = x[0]-x[4]
  1593 	VSWP		D19,D26
  1594 	VADD.S16	Q8, Q8, Q12	; Q8 = x[0]+x[4]
  1595 	VSWP		D21,D28
  1596 	VSWP		D23,D30
  1597 	; Column transforms
  1598 	BL	oc_idct8x8_stage123_neon
  1599 	CMP	r0,r1
  1600 	; We have to put the return address back in the LR, or the branch
  1601 	;  predictor will not recognize the function return and mis-predict the
  1602 	;  entire call stack.
  1603 	MOV	r14, r12
  1604 ; Stage 4
  1605 	VSUB.S16	Q15,Q8, Q7	; Q15 = y[7]=t[0]'-t[7]'
  1606 	VADD.S16	Q8, Q8, Q7	; Q8  = y[0]=t[0]'+t[7]'
  1607 	VSUB.S16	Q14,Q9, Q3	; Q14 = y[6]=t[1]'-t[6]''
  1608 	VADD.S16	Q9, Q9, Q3	; Q9  = y[1]=t[1]'+t[6]''
  1609 	VSUB.S16	Q13,Q10,Q5	; Q13 = y[5]=t[2]'-t[5]''
  1610 	VADD.S16	Q10,Q10,Q5	; Q10 = y[2]=t[2]'+t[5]''
  1611 	VSUB.S16	Q12,Q11,Q4	; Q12 = y[4]=t[3]'-t[4]'
  1612 	VADD.S16	Q11,Q11,Q4	; Q11 = y[3]=t[3]'+t[4]'
  1613 	BEQ		oc_idct8x8_slow_neon_noclear
  1614 	VMOV.I8		Q2,#0
  1615 	VPOP		{D8-D15}
  1616 	VMOV.I8		Q3,#0
  1617 	VRSHR.S16	Q8, Q8, #4	; Q8  = y[0]+8>>4
  1618 	VST1.64		{D4, D5, D6, D7}, [r1@128]!
  1619 	VRSHR.S16	Q9, Q9, #4	; Q9  = y[1]+8>>4
  1620 	VRSHR.S16	Q10,Q10,#4	; Q10 = y[2]+8>>4
  1621 	VST1.64		{D4, D5, D6, D7}, [r1@128]!
  1622 	VRSHR.S16	Q11,Q11,#4	; Q11 = y[3]+8>>4
  1623 	VRSHR.S16	Q12,Q12,#4	; Q12 = y[4]+8>>4
  1624 	VST1.64		{D4, D5, D6, D7}, [r1@128]!
  1625 	VRSHR.S16	Q13,Q13,#4	; Q13 = y[5]+8>>4
  1626 	VRSHR.S16	Q14,Q14,#4	; Q14 = y[6]+8>>4
  1627 	VST1.64		{D4, D5, D6, D7}, [r1@128]
  1628 	VRSHR.S16	Q15,Q15,#4	; Q15 = y[7]+8>>4
  1629 	VSTMIA		r0, {D16-D31}
  1630 	MOV	PC, r14
  1632 oc_idct8x8_slow_neon_noclear
  1633 	VPOP		{D8-D15}
  1634 	VRSHR.S16	Q8, Q8, #4	; Q8  = y[0]+8>>4
  1635 	VRSHR.S16	Q9, Q9, #4	; Q9  = y[1]+8>>4
  1636 	VRSHR.S16	Q10,Q10,#4	; Q10 = y[2]+8>>4
  1637 	VRSHR.S16	Q11,Q11,#4	; Q11 = y[3]+8>>4
  1638 	VRSHR.S16	Q12,Q12,#4	; Q12 = y[4]+8>>4
  1639 	VRSHR.S16	Q13,Q13,#4	; Q13 = y[5]+8>>4
  1640 	VRSHR.S16	Q14,Q14,#4	; Q14 = y[6]+8>>4
  1641 	VRSHR.S16	Q15,Q15,#4	; Q15 = y[7]+8>>4
  1642 	VSTMIA		r0, {D16-D31}
  1643 	MOV	PC, r14
  1644 	ENDP
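; Stages 1-3 shared by the row and column passes of oc_idct8x8_slow_neon.
; On entry: Q8 = x[0]+x[4], Q1 = x[0]-x[4], Q9-Q11 = x[1]-x[3],
;  Q13-Q15 = x[5]-x[7], and D0/D1 hold OC_IDCT_CONSTS_NEON.
; On exit:  Q8-Q11 = t[0]''-t[3]'', Q4 = t[4]', Q5 = t[5]'', Q3 = t[6]'',
;  Q7 = t[7]', ready for stage 4 in the caller.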
  1646 oc_idct8x8_stage123_neon PROC
  1647 ; Stages 1 & 2
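	; The constants with the high bit set (C1S7, C2S6, C3S5, C4S4, C5S3) are
	;  negative as signed 16-bit values, so multiplying by them with VMULL.S16
	;  yields OC_Cx*x-(x<<16); after the narrowing shift the missing x is added
	;  back where needed. C6S2 and C7S1 fit in 15 bits and need no correction.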
  1648 	VMULL.S16	Q4, D18,D1[3]
  1649 	VMULL.S16	Q5, D19,D1[3]
  1650 	VMULL.S16	Q7, D30,D1[3]
  1651 	VMULL.S16	Q6, D31,D1[3]
  1652 	VMULL.S16	Q2, D30,D0[1]
  1653 	VMULL.S16	Q3, D31,D0[1]
  1654 	VSHRN.S32	D8, Q4, #16
  1655 	VSHRN.S32	D9, Q5, #16	; Q4 = (OC_C7S1*x[1]>>16)
  1656 	VSHRN.S32	D14,Q7, #16
  1657 	VSHRN.S32	D15,Q6, #16	; Q7 = (OC_C7S1*x[7]>>16)
  1658 	VSHRN.S32	D4, Q2, #16
  1659 	VSHRN.S32	D5, Q3, #16	; Q2 = (OC_C1S7*x[7]>>16)-x[7]
  1660 	VSUB.S16	Q4, Q4, Q15
  1661 	VADD.S16	Q7, Q7, Q9
  1662 	VSUB.S16	Q4, Q4, Q2	; Q4 = t[4]
  1663 	VMULL.S16	Q2, D18,D0[1]
  1664 	VMULL.S16	Q9, D19,D0[1]
  1665 	VMULL.S16	Q5, D26,D0[3]
  1666 	VMULL.S16	Q3, D27,D0[3]
  1667 	VMULL.S16	Q6, D22,D0[3]
  1668 	VMULL.S16	Q12,D23,D0[3]
  1669 	VSHRN.S32	D4, Q2, #16
  1670 	VSHRN.S32	D5, Q9, #16	; Q2 = (OC_C1S7*x[1]>>16)-x[1]
  1671 	VSHRN.S32	D10,Q5, #16
  1672 	VSHRN.S32	D11,Q3, #16	; Q5 = (OC_C3S5*x[5]>>16)-x[5]
  1673 	VSHRN.S32	D12,Q6, #16
  1674 	VSHRN.S32	D13,Q12,#16	; Q6 = (OC_C3S5*x[3]>>16)-x[3]
  1675 	VADD.S16	Q7, Q7, Q2	; Q7 = t[7]
  1676 	VSUB.S16	Q5, Q5, Q11
  1677 	VADD.S16	Q6, Q6, Q11
  1678 	VADD.S16	Q5, Q5, Q13
  1679 	VADD.S16	Q6, Q6, Q13
  1680 	VMULL.S16	Q9, D22,D1[1]
  1681 	VMULL.S16	Q11,D23,D1[1]
  1682 	VMULL.S16	Q15,D26,D1[1]
  1683 	VMULL.S16	Q13,D27,D1[1]
  1684 	VMULL.S16	Q2, D20,D1[2]
  1685 	VMULL.S16	Q12,D21,D1[2]
  1686 	VSHRN.S32	D18,Q9, #16
  1687 	VSHRN.S32	D19,Q11,#16	; Q9 = (OC_C5S3*x[3]>>16)-x[3]
  1688 	VSHRN.S32	D30,Q15,#16
  1689 	VSHRN.S32	D31,Q13,#16	; Q15= (OC_C5S3*x[5]>>16)-x[5]
  1690 	VSHRN.S32	D4, Q2, #16
  1691 	VSHRN.S32	D5, Q12,#16	; Q2 = (OC_C6S2*x[2]>>16)
  1692 	VSUB.S16	Q5, Q5, Q9	; Q5 = t[5]
  1693 	VADD.S16	Q6, Q6, Q15	; Q6 = t[6]
  1694 	VSUB.S16	Q2, Q2, Q14
  1695 	VMULL.S16	Q3, D28,D1[2]
  1696 	VMULL.S16	Q11,D29,D1[2]
  1697 	VMULL.S16	Q12,D28,D0[2]
  1698 	VMULL.S16	Q9, D29,D0[2]
  1699 	VMULL.S16	Q13,D20,D0[2]
  1700 	VMULL.S16	Q15,D21,D0[2]
  1701 	VSHRN.S32	D6, Q3, #16
  1702 	VSHRN.S32	D7, Q11,#16	; Q3 = (OC_C6S2*x[6]>>16)
  1703 	VSHRN.S32	D24,Q12,#16
  1704 	VSHRN.S32	D25,Q9, #16	; Q12= (OC_C2S6*x[6]>>16)-x[6]
  1705 	VSHRN.S32	D26,Q13,#16
  1706 	VSHRN.S32	D27,Q15,#16	; Q13= (OC_C2S6*x[2]>>16)-x[2]
  1707 	VSUB.S16	Q9, Q4, Q5	; Q9 = t[4]-t[5]
  1708 	VSUB.S16	Q11,Q7, Q6	; Q11= t[7]-t[6]
  1709 	VADD.S16	Q3, Q3, Q10
  1710 	VADD.S16	Q4, Q4, Q5	; Q4 = t[4]'=t[4]+t[5]
  1711 	VADD.S16	Q7, Q7, Q6	; Q7 = t[7]'=t[7]+t[6]
  1712 	VSUB.S16	Q2, Q2, Q12	; Q2 = t[2]
  1713 	VADD.S16	Q3, Q3, Q13	; Q3 = t[3]
  1714 	VMULL.S16	Q12,D16,D1[0]
  1715 	VMULL.S16	Q13,D17,D1[0]
  1716 	VMULL.S16	Q14,D2, D1[0]
  1717 	VMULL.S16	Q15,D3, D1[0]
  1718 	VMULL.S16	Q5, D18,D1[0]
  1719 	VMULL.S16	Q6, D22,D1[0]
  1720 	VSHRN.S32	D24,Q12,#16
  1721 	VSHRN.S32	D25,Q13,#16
  1722 	VSHRN.S32	D28,Q14,#16
  1723 	VSHRN.S32	D29,Q15,#16
  1724 	VMULL.S16	Q13,D19,D1[0]
  1725 	VMULL.S16	Q15,D23,D1[0]
  1726 	VADD.S16	Q8, Q8, Q12	; Q8 = t[0]
  1727 	VADD.S16	Q1, Q1, Q14	; Q1 = t[1]
  1728 	VSHRN.S32	D10,Q5, #16
  1729 	VSHRN.S32	D12,Q6, #16
  1730 	VSHRN.S32	D11,Q13,#16
  1731 	VSHRN.S32	D13,Q15,#16
  1732 	VADD.S16	Q5, Q5, Q9	; Q5 = t[5]'=OC_C4S4*(t[4]-t[5])>>16
  1733 	VADD.S16	Q6, Q6, Q11	; Q6 = t[6]'=OC_C4S4*(t[7]-t[6])>>16
  1734 ; Stage 3
  1735 	VSUB.S16	Q11,Q8, Q3	; Q11 = t[3]''=t[0]-t[3]
  1736 	VADD.S16	Q8, Q8, Q3	; Q8  = t[0]''=t[0]+t[3]
  1737 	VADD.S16	Q9, Q1, Q2	; Q9  = t[1]''=t[1]+t[2]
  1738 	VADD.S16	Q3, Q6, Q5	; Q3  = t[6]''=t[6]'+t[5]'
  1739 	VSUB.S16	Q10,Q1, Q2	; Q10 = t[2]''=t[1]-t[2]
  1740 	VSUB.S16	Q5, Q6, Q5	; Q5  = t[5]''=t[6]'-t[5]'
  1741 	MOV	PC, r14
  1742 	ENDP
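; Used when _last_zzi is at most 10: the first ten zig-zag coefficients all
;  lie in the top-left 4x4 quadrant, so only a 4x4 corner of the
;  (pre-transposed) input block needs to be transformed.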
  1744 oc_idct8x8_10_neon PROC
  1745 	ADR	r3, OC_IDCT_CONSTS_NEON
  1746 	VLD1.64		{D0,D1},          [r3@128]
  1747 	MOV	r2, r1
  1748 	; Row transforms (input is pre-transposed)
  1749 ; Stage 1
  1750 	VLD1.64		{D16,D17,D18,D19},[r2@128]!
  1751 	MOV	r12, #16
  1752 	VMULL.S16	Q15,D16,D1[0]	; Q15= OC_C4S4*x[0]-(x[0]<<16)
  1753 	VLD1.64		{D17},            [r2@64], r12
  1754 	VMULL.S16	Q2, D18,D0[1]	; Q2 = OC_C1S7*x[1]-(x[1]<<16)
  1755 	VLD1.64		{D19},            [r2@64]
  1756 	VMULL.S16	Q14,D17,D0[2]	; Q14= OC_C2S6*x[2]-(x[2]<<16)
  1757 	VMULL.S16	Q3, D19,D0[3]	; Q3 = OC_C3S5*x[3]-(x[3]<<16)
  1758 	VMULL.S16	Q13,D19,D1[1]	; Q13= OC_C5S3*x[3]-(x[3]<<16)
  1759 	VMULL.S16	Q12,D18,D1[3]	; Q12= OC_C7S1*x[1]
  1760 	VMULL.S16	Q1, D17,D1[2]	; Q1 = OC_C6S2*x[2]
  1761 	VSHRN.S32	D30,Q15,#16	; D30= t[0]-x[0]
  1762 	VSHRN.S32	D4, Q2, #16	; D4 = t[7]-x[1]
  1763 	VSHRN.S32	D31,Q14,#16	; D31= t[3]-x[2]
  1764 	VSHRN.S32	D6, Q3, #16	; D6 = t[6]-x[3]
  1765 	VSHRN.S32	D7, Q13,#16	; D7 = -t[5]-x[3]
  1766 	VSHRN.S32	D5, Q12,#16	; D5 = t[4]
  1767 	VSHRN.S32	D2, Q1, #16	; D2 = t[2]
  1768 	VADD.S16	D4, D4, D18	; D4 = t[7]
  1769 	VADD.S16	D6, D6, D19	; D6 = t[6]
  1770 	VADD.S16	D7, D7, D19	; D7 = -t[5]
  1771 	VADD.S16	Q15,Q15,Q8	; D30= t[0]
  1772 					; D31= t[3]
  1773 ; Stages 2 & 3
  1774 	VSUB.S16	Q12,Q2, Q3	; D24= t[7]-t[6]
  1775 					; D25= t[4]'=t[4]+t[5]
  1776 	VADD.S16	Q13,Q2, Q3	; D26= t[7]'=t[7]+t[6]
  1777 					; D27= t[4]-t[5]
  1778 	VMULL.S16	Q11,D24,D1[0]	; Q11= OC_C4S4*(t[7]-t[6])
  1779 					;       -(t[7]-t[6]<<16)
  1780 	VMULL.S16	Q14,D27,D1[0]	; Q14= OC_C4S4*(t[4]-t[5])
  1781 					;       -(t[4]-t[5]<<16)
  1782 	VADD.S16	D16,D30,D31	; D16= t[0]'=t[0]+t[3]
  1783 	VSUB.S16	D17,D30,D2	; D17= t[2]'=t[0]-t[2]
  1784 	VADD.S16	D18,D30,D2	; D18= t[1]'=t[0]+t[2]
  1785 	VSHRN.S32	D22,Q11,#16	; D22= (OC_C4S4*(t[7]-t[6])>>16)
  1786 					;       -(t[7]-t[6])
  1787 	VSHRN.S32	D23,Q14,#16	; D23= (OC_C4S4*(t[4]-t[5])>>16)
  1788 					;       -(t[4]-t[5])
  1789 	VSUB.S16	D19,D30,D31	; D19= t[3]'=t[0]-t[3]
  1790 	VADD.S16	D22,D22,D24	; D22= t[6]'=OC_C4S4*(t[7]-t[6])>>16
  1791 	VADD.S16	D23,D23,D27	; D23= t[5]'=OC_C4S4*(t[4]-t[5])>>16
  1792 	VSUB.S16	D27,D22,D23	; D27= t[5]''=t[6]'-t[5]'
  1793 	VADD.S16	D24,D22,D23	; D24= t[6]''=t[6]'+t[5]'
  1794 ; Stage 4
  1795 	VSUB.S16	Q11,Q8, Q13	; D22= y[7]=t[0]'-t[7]'
  1796 					; D23= y[5]=t[2]'-t[5]''
  1797 	VSUB.S16	Q10,Q9, Q12	; D20= y[6]=t[1]'-t[6]''
  1798 					; D21= y[4]=t[3]'-t[4]'
  1799 	VADD.S16	Q8, Q8, Q13	; D16= y[0]=t[0]'+t[7]'
  1800 					; D17= y[2]=t[2]'+t[5]''
  1801 	VADD.S16	Q9, Q9, Q12	; D18= y[1]=t[1]'+t[6]''
  1802 					; D19= y[3]=t[3]'+t[4]'
  1803 	; 8x4 transpose
  1804 	VTRN.16		Q10,Q11		; Q10= c5c4a5a4 c7c6a7a6
  1805 					; Q11= d5d4b5b4 d7d6b7b6
  1806 	VTRN.16		Q8, Q9		; Q8 = c3c2a3a2 c1c0a1a0
  1807 					; Q9 = d3d2b3b2 d1d0b1b0
  1808 	VSWP		D20,D21		; Q10= c7c6a7a6 c5c4a5a4
  1809 	VSWP		D22,D23		; Q11= d7d6b7b6 d5d4b5b4
  1810 	VUZP.32		Q9, Q11		; Q9 = b7b6b5b4 b3b2b1b0
  1811 					; Q11= d7d6d5d4 d3d2d1d0
  1812 	VMULL.S16	Q15,D18,D0[1]
  1813 	VMULL.S16	Q13,D22,D1[1]
  1814 	VUZP.32		Q8, Q10		; Q8 = a7a6a5a4 a3a2a1a0
  1815 					; Q10= c7c6c5c4 c3c2c1c0
  1816 	; Column transforms
  1817 ; Stages 1, 2, & 3
  1818 	VMULL.S16	Q14,D19,D0[1]	; Q14:Q15= OC_C1S7*x[1]-(x[1]<<16)
  1819 	VMULL.S16	Q12,D23,D1[1]	; Q12:Q13= OC_C5S3*x[3]-(x[3]<<16)
  1820 	VMULL.S16	Q3, D22,D0[3]
  1821 	VMULL.S16	Q2, D23,D0[3]	;  Q2:Q3 = OC_C3S5*x[3]-(x[3]<<16)
  1822 	VSHRN.S32	D30,Q15,#16
  1823 	VSHRN.S32	D31,Q14,#16	; Q15= (OC_C1S7*x[1]>>16)-x[1]
  1824 	VSHRN.S32	D26,Q13,#16
  1825 	VSHRN.S32	D27,Q12,#16	; Q13= (OC_C5S3*x[3]>>16)-x[3]
  1826 	VSHRN.S32	D28,Q3, #16
  1827 	VSHRN.S32	D29,Q2, #16	; Q14= (OC_C3S5*x[3]>>16)-x[3]
  1828 	VADD.S16	Q15,Q15,Q9	; Q15= t[7]
  1829 	VADD.S16	Q13,Q13,Q11	; Q13= -t[5]
  1830 	VADD.S16	Q14,Q14,Q11	; Q14= t[6]
  1831 	VMULL.S16	Q12,D18,D1[3]
  1832 	VMULL.S16	Q2, D19,D1[3]	;  Q2:Q12= OC_C7S1*x[1]
  1833 	VMULL.S16	Q1, D16,D1[0]
  1834 	VMULL.S16	Q11,D17,D1[0]	; Q11:Q1 = OC_C4S4*x[0]-(x[0]<<16)
  1835 	VMULL.S16	Q3, D20,D0[2]
  1836 	VMULL.S16	Q9, D21,D0[2]	;  Q9:Q3 = OC_C2S6*x[2]-(x[2]<<16)
  1837 	VSHRN.S32	D24,Q12,#16
  1838 	VSHRN.S32	D25,Q2, #16	; Q12= t[4]
  1839 	VMULL.S16	Q2, D20,D1[2]
  1840 	VSHRN.S32	D2, Q1, #16
  1841 	VSHRN.S32	D3, Q11,#16	; Q1 = (OC_C4S4*x[0]>>16)-x[0]
  1842 	VMULL.S16	Q11,D21,D1[2]	;  Q2:Q11= OC_C6S2*x[2]
  1843 	VSHRN.S32	D6, Q3, #16
  1844 	VSHRN.S32	D7, Q9, #16	; Q3 = (OC_C2S6*x[2]>>16)-x[2]
  1845 	VSUB.S16	Q9, Q15,Q14	; Q9 = t[7]-t[6]
  1846 	VADD.S16	Q15,Q15,Q14	; Q15= t[7]'=t[7]+t[6]
  1847 	VSHRN.S32	D4, Q2, #16
  1848 	VSHRN.S32	D5, Q11,#16	; Q2 = t[2]
  1849 	VADD.S16	Q1, Q1, Q8	; Q1 = t[0]
  1850 	VADD.S16	Q8, Q12,Q13	; Q8 = t[4]-t[5]
  1851 	VADD.S16	Q3, Q3, Q10	; Q3 = t[3]
  1852 	VMULL.S16	Q10,D16,D1[0]
  1853 	VMULL.S16	Q11,D17,D1[0]	; Q11:Q10= OC_C4S4*(t[4]-t[5])
  1854 					;           -(t[4]-t[5]<<16)
  1855 	VSUB.S16	Q12,Q12,Q13	; Q12= t[4]'=t[4]+t[5]
  1856 	VMULL.S16	Q14,D18,D1[0]
  1857 	VMULL.S16	Q13,D19,D1[0]	; Q13:Q14= OC_C4S4*(t[7]-t[6])
  1858 					;           -(t[7]-t[6]<<16)
  1859 	VSHRN.S32	D20,Q10,#16
  1860 	VSHRN.S32	D21,Q11,#16	; Q10= (OC_C4S4*(t[4]-t[5])>>16)
  1861 					;       -(t[4]-t[5])
  1862 	VADD.S16	Q11,Q1, Q3	; Q11= t[0]'=t[0]+t[3]
  1863 	VSUB.S16	Q3, Q1, Q3	; Q3 = t[3]'=t[0]-t[3]
  1864 	VSHRN.S32	D28,Q14,#16
  1865 	VSHRN.S32	D29,Q13,#16	; Q14= (OC_C4S4*(t[7]-t[6])>>16)
  1866 					;       -(t[7]-t[6])
  1867 	VADD.S16	Q10,Q10,Q8	; Q10=t[5]'
  1868 	VADD.S16	Q14,Q14,Q9	; Q14=t[6]'
  1869 	VSUB.S16	Q13,Q14,Q10	; Q13=t[5]''=t[6]'-t[5]'
  1870 	VADD.S16	Q14,Q14,Q10	; Q14=t[6]''=t[6]'+t[5]'
  1871 	VADD.S16	Q10,Q1, Q2	; Q10= t[1]'=t[0]+t[2]
  1872 	VSUB.S16	Q2, Q1, Q2	; Q2 = t[2]'=t[0]-t[2]
  1873 ; Stage 4
  1874 	CMP	r0, r1
  1875 	VADD.S16	Q8, Q11,Q15	; Q8  = y[0]=t[0]'+t[7]'
  1876 	VADD.S16	Q9, Q10,Q14	; Q9  = y[1]=t[1]'+t[6]''
  1877 	VSUB.S16	Q15,Q11,Q15	; Q15 = y[7]=t[0]'-t[7]'
  1878 	VSUB.S16	Q14,Q10,Q14	; Q14 = y[6]=t[1]'-t[6]''
  1879 	VADD.S16	Q10,Q2, Q13	; Q10 = y[2]=t[2]'+t[5]''
  1880 	VADD.S16	Q11,Q3, Q12	; Q11 = y[3]=t[3]'+t[4]'
  1881 	VSUB.S16	Q12,Q3, Q12	; Q12 = y[4]=t[3]'-t[4]'
  1882 	VSUB.S16	Q13,Q2, Q13	; Q13 = y[5]=t[2]'-t[5]''
  1883 	BEQ	oc_idct8x8_10_neon_noclear
  1884 	VMOV.I8		D2, #0
  1885 	VRSHR.S16	Q8, Q8, #4	; Q8  = y[0]+8>>4
  1886 	VST1.64		{D2}, [r1@64], r12
  1887 	VRSHR.S16	Q9, Q9, #4	; Q9  = y[1]+8>>4
  1888 	VRSHR.S16	Q10,Q10,#4	; Q10 = y[2]+8>>4
  1889 	VST1.64		{D2}, [r1@64], r12
  1890 	VRSHR.S16	Q11,Q11,#4	; Q11 = y[3]+8>>4
  1891 	VRSHR.S16	Q12,Q12,#4	; Q12 = y[4]+8>>4
  1892 	VST1.64		{D2}, [r1@64], r12
  1893 	VRSHR.S16	Q13,Q13,#4	; Q13 = y[5]+8>>4
  1894 	VRSHR.S16	Q14,Q14,#4	; Q14 = y[6]+8>>4
  1895 	VST1.64		{D2}, [r1@64]
  1896 	VRSHR.S16	Q15,Q15,#4	; Q15 = y[7]+8>>4
  1897 	VSTMIA		r0, {D16-D31}
  1898 	MOV	PC, r14
  1900 oc_idct8x8_10_neon_noclear
  1901 	VRSHR.S16	Q8, Q8, #4	; Q8  = y[0]+8>>4
  1902 	VRSHR.S16	Q9, Q9, #4	; Q9  = y[1]+8>>4
  1903 	VRSHR.S16	Q10,Q10,#4	; Q10 = y[2]+8>>4
  1904 	VRSHR.S16	Q11,Q11,#4	; Q11 = y[3]+8>>4
  1905 	VRSHR.S16	Q12,Q12,#4	; Q12 = y[4]+8>>4
  1906 	VRSHR.S16	Q13,Q13,#4	; Q13 = y[5]+8>>4
  1907 	VRSHR.S16	Q14,Q14,#4	; Q14 = y[6]+8>>4
  1908 	VRSHR.S16	Q15,Q15,#4	; Q15 = y[7]+8>>4
  1909 	VSTMIA		r0, {D16-D31}
  1910 	MOV	PC, r14
  1911 	ENDP
  1914 	END
