media/libtheora/lib/arm/armidct.s

author:      Michael Schloh von Bennewitz <michael@schloh.com>
date:        Wed, 31 Dec 2014 06:09:35 +0100
changeset:   0:6474c204b198
permissions: -rw-r--r--

Cloned from upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1,
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f, for hacking purposes.

michael@0 1 ;********************************************************************
michael@0 2 ;* *
michael@0 3 ;* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
michael@0 4 ;* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
michael@0 5 ;* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
michael@0 6 ;* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
michael@0 7 ;* *
michael@0 8 ;* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010 *
michael@0 9 ;* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
michael@0 10 ;* *
michael@0 11 ;********************************************************************
michael@0 12 ; Original implementation:
michael@0 13 ; Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd
michael@0 14 ; last mod: $Id: armidct.s 17481 2010-10-03 22:49:42Z tterribe $
michael@0 15 ;********************************************************************
michael@0 16
michael@0 17 AREA |.text|, CODE, READONLY
michael@0 18
michael@0 19 ; Explicitly specifying alignment here because some versions of
michael@0 20 ; gas don't align code correctly. See
michael@0 21 ; http://lists.gnu.org/archive/html/bug-binutils/2011-06/msg00199.html
michael@0 22 ; https://bugzilla.mozilla.org/show_bug.cgi?id=920992
michael@0 23 ALIGN
michael@0 24
michael@0 25 GET armopts.s
michael@0 26
michael@0 27 EXPORT oc_idct8x8_1_arm
michael@0 28 EXPORT oc_idct8x8_arm
michael@0 29
michael@0 30 oc_idct8x8_1_arm PROC
michael@0 31 ; r0 = ogg_int16_t *_y
michael@0 32 ; r1 = ogg_uint16_t _dc
michael@0 33 ORR r1, r1, r1, LSL #16
michael@0 34 MOV r2, r1
michael@0 35 MOV r3, r1
michael@0 36 MOV r12,r1
michael@0 37 STMIA r0!,{r1,r2,r3,r12}
michael@0 38 STMIA r0!,{r1,r2,r3,r12}
michael@0 39 STMIA r0!,{r1,r2,r3,r12}
michael@0 40 STMIA r0!,{r1,r2,r3,r12}
michael@0 41 STMIA r0!,{r1,r2,r3,r12}
michael@0 42 STMIA r0!,{r1,r2,r3,r12}
michael@0 43 STMIA r0!,{r1,r2,r3,r12}
michael@0 44 STMIA r0!,{r1,r2,r3,r12}
michael@0 45 MOV PC, r14
michael@0 46 ENDP
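; oc_idct8x8_1_arm fills the whole block with the (already scaled) DC value
; passed in _dc; a minimal C sketch of the effect, assuming the caller has
; done all scaling beforehand (the routine itself does no arithmetic on _dc):
;
;   for (i = 0; i < 64; i++) _y[i] = (ogg_int16_t)_dc;
;
; The ORR replicates _dc into both halfwords of r1, and eight STMIAs of four
; registers each cover the 64*2 = 128 output bytes.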
michael@0 47
michael@0 48 oc_idct8x8_arm PROC
michael@0 49 ; r0 = ogg_int16_t *_y
michael@0 50 ; r1 = ogg_int16_t *_x
michael@0 51 ; r2 = int _last_zzi
michael@0 52 CMP r2, #3
michael@0 53 BLE oc_idct8x8_3_arm
michael@0 54 CMP r2, #6
michael@0 55 BLE oc_idct8x8_6_arm
michael@0 56 CMP r2, #10
michael@0 57 BLE oc_idct8x8_10_arm
michael@0 58 oc_idct8x8_slow_arm
michael@0 59 STMFD r13!,{r4-r11,r14}
michael@0 60 SUB r13,r13,#64*2
michael@0 61 ; Row transforms
michael@0 62 STR r0, [r13,#-4]!
michael@0 63 ADD r0, r13, #4 ; Write to temp storage.
michael@0 64 BL idct8core_arm
michael@0 65 BL idct8core_arm
michael@0 66 BL idct8core_arm
michael@0 67 BL idct8core_arm
michael@0 68 BL idct8core_arm
michael@0 69 BL idct8core_arm
michael@0 70 BL idct8core_arm
michael@0 71 BL idct8core_arm
michael@0 72 LDR r0, [r13], #4 ; Write to the final destination.
michael@0 73 ; Clear input data for next block (decoder only).
michael@0 74 SUB r2, r1, #8*16
michael@0 75 CMP r0, r2
michael@0 76 MOV r1, r13 ; And read from temp storage.
michael@0 77 BEQ oc_idct8x8_slow_arm_cols
michael@0 78 MOV r4, #0
michael@0 79 MOV r5, #0
michael@0 80 MOV r6, #0
michael@0 81 MOV r7, #0
michael@0 82 STMIA r2!,{r4,r5,r6,r7}
michael@0 83 STMIA r2!,{r4,r5,r6,r7}
michael@0 84 STMIA r2!,{r4,r5,r6,r7}
michael@0 85 STMIA r2!,{r4,r5,r6,r7}
michael@0 86 STMIA r2!,{r4,r5,r6,r7}
michael@0 87 STMIA r2!,{r4,r5,r6,r7}
michael@0 88 STMIA r2!,{r4,r5,r6,r7}
michael@0 89 STMIA r2!,{r4,r5,r6,r7}
michael@0 90 oc_idct8x8_slow_arm_cols
michael@0 91 ; Column transforms
michael@0 92 BL idct8core_down_arm
michael@0 93 BL idct8core_down_arm
michael@0 94 BL idct8core_down_arm
michael@0 95 BL idct8core_down_arm
michael@0 96 BL idct8core_down_arm
michael@0 97 BL idct8core_down_arm
michael@0 98 BL idct8core_down_arm
michael@0 99 BL idct8core_down_arm
michael@0 100 ADD r13,r13,#64*2
michael@0 101 LDMFD r13!,{r4-r11,PC}
michael@0 102 ENDP
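; oc_idct8x8_arm is the dispatcher: _last_zzi bounds how far into zig-zag
; order non-zero coefficients may appear, so small values let most of the row
; transforms be skipped.  A rough C sketch of the control flow only (the
; helper names mirror the assembly labels, not any particular C source):
;
;   if (_last_zzi <= 3)       oc_idct8x8_3(_y, _x);   /* 2 non-trivial rows */
;   else if (_last_zzi <= 6)  oc_idct8x8_6(_y, _x);   /* 3 non-trivial rows */
;   else if (_last_zzi <= 10) oc_idct8x8_10(_y, _x);  /* 4 non-trivial rows */
;   else {
;     /* slow path: 8 row transforms into a 64-entry temp buffer on the
;        stack, clear the input block if _x != _y (decoder only), then 8
;        column transforms (the *_down cores) into the final destination. */
;   }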
michael@0 103
michael@0 104 oc_idct8x8_10_arm PROC
michael@0 105 STMFD r13!,{r4-r11,r14}
michael@0 106 SUB r13,r13,#64*2
michael@0 107 ; Row transforms
michael@0 108 MOV r2, r0
michael@0 109 MOV r0, r13 ; Write to temp storage.
michael@0 110 BL idct4core_arm
michael@0 111 BL idct3core_arm
michael@0 112 BL idct2core_arm
michael@0 113 BL idct1core_arm
michael@0 114 ; Clear input data for next block (decoder only).
michael@0 115 SUB r0, r1, #4*16
michael@0 116 CMP r0, r2
michael@0 117 MOV r1, r13 ; Read from temp storage.
michael@0 118 BEQ oc_idct8x8_10_arm_cols
michael@0 119 MOV r4, #0
michael@0 120 STR r4, [r0]
michael@0 121 STR r4, [r0,#4]
michael@0 122 STR r4, [r0,#16]
michael@0 123 STR r4, [r0,#20]
michael@0 124 STR r4, [r0,#32]
michael@0 125 STR r4, [r0,#48]
michael@0 126 MOV r0, r2 ; Write to the final destination
michael@0 127 oc_idct8x8_10_arm_cols
michael@0 128 ; Column transforms
michael@0 129 BL idct4core_down_arm
michael@0 130 BL idct4core_down_arm
michael@0 131 BL idct4core_down_arm
michael@0 132 BL idct4core_down_arm
michael@0 133 BL idct4core_down_arm
michael@0 134 BL idct4core_down_arm
michael@0 135 BL idct4core_down_arm
michael@0 136 BL idct4core_down_arm
michael@0 137 ADD r13,r13,#64*2
michael@0 138 LDMFD r13!,{r4-r11,PC}
michael@0 139 ENDP
michael@0 140
michael@0 141 oc_idct8x8_6_arm PROC
michael@0 142 STMFD r13!,{r4-r7,r9-r11,r14}
michael@0 143 SUB r13,r13,#64*2
michael@0 144 ; Row transforms
michael@0 145 MOV r2, r0
michael@0 146 MOV r0, r13 ; Write to temp storage.
michael@0 147 BL idct3core_arm
michael@0 148 BL idct2core_arm
michael@0 149 BL idct1core_arm
michael@0 150 ; Clear input data for next block (decoder only).
michael@0 151 SUB r0, r1, #3*16
michael@0 152 CMP r0, r2
michael@0 153 MOV r1, r13 ; Read from temp storage.
michael@0 154 BEQ oc_idct8x8_6_arm_cols
michael@0 155 MOV r4, #0
michael@0 156 STR r4, [r0]
michael@0 157 STR r4, [r0,#4]
michael@0 158 STR r4, [r0,#16]
michael@0 159 STR r4, [r0,#32]
michael@0 160 MOV r0, r2 ; Write to the final destination
michael@0 161 oc_idct8x8_6_arm_cols
michael@0 162 ; Column transforms
michael@0 163 BL idct3core_down_arm
michael@0 164 BL idct3core_down_arm
michael@0 165 BL idct3core_down_arm
michael@0 166 BL idct3core_down_arm
michael@0 167 BL idct3core_down_arm
michael@0 168 BL idct3core_down_arm
michael@0 169 BL idct3core_down_arm
michael@0 170 BL idct3core_down_arm
michael@0 171 ADD r13,r13,#64*2
michael@0 172 LDMFD r13!,{r4-r7,r9-r11,PC}
michael@0 173 ENDP
michael@0 174
michael@0 175 oc_idct8x8_3_arm PROC
michael@0 176 STMFD r13!,{r4-r7,r9-r11,r14}
michael@0 177 SUB r13,r13,#64*2
michael@0 178 ; Row transforms
michael@0 179 MOV r2, r0
michael@0 180 MOV r0, r13 ; Write to temp storage.
michael@0 181 BL idct2core_arm
michael@0 182 BL idct1core_arm
michael@0 183 ; Clear input data for next block (decoder only).
michael@0 184 SUB r0, r1, #2*16
michael@0 185 CMP r0, r2
michael@0 186 MOV r1, r13 ; Read from temp storage.
michael@0 187 MOVNE r4, #0
michael@0 188 STRNE r4, [r0]
michael@0 189 STRNE r4, [r0,#16]
michael@0 190 MOVNE r0, r2 ; Write to the final destination
michael@0 191 ; Column transforms
michael@0 192 BL idct2core_down_arm
michael@0 193 BL idct2core_down_arm
michael@0 194 BL idct2core_down_arm
michael@0 195 BL idct2core_down_arm
michael@0 196 BL idct2core_down_arm
michael@0 197 BL idct2core_down_arm
michael@0 198 BL idct2core_down_arm
michael@0 199 BL idct2core_down_arm
michael@0 200 ADD r13,r13,#64*2
michael@0 201 LDMFD r13!,{r4-r7,r9-r11,PC}
michael@0 202 ENDP
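; Why these cut-offs work: coefficients are counted in zig-zag order, so
;   _last_zzi <= 3  => only x[0], x[1] and x[8] can be non-zero
;                      (2 coefficients in row 0, 1 in row 1),
;   _last_zzi <= 6  => adds x[2], x[9] and x[16]
;                      (3, 2 and 1 coefficients in rows 0-2),
;   _last_zzi <= 10 => adds x[3], x[10], x[17] and x[24]
;                      (4, 3, 2 and 1 coefficients in rows 0-3).
; That is why the _10, _6 and _3 variants above run idct4/3/2/1, idct3/2/1
; and idct2/1 row cores respectively, and why their "clear input" code only
; needs to zero those positions.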
michael@0 203
michael@0 204 idct1core_arm PROC
michael@0 205 ; r0 = ogg_int16_t *_y (destination)
michael@0 206 ; r1 = const ogg_int16_t *_x (source)
michael@0 207 LDRSH r3, [r1], #16
michael@0 208 MOV r12,#0x05
michael@0 209 ORR r12,r12,#0xB500
michael@0 210 MUL r3, r12, r3
michael@0 211 ; Stall ?
michael@0 212 MOV r3, r3, ASR #16
michael@0 213 STRH r3, [r0], #2
michael@0 214 STRH r3, [r0, #14]
michael@0 215 STRH r3, [r0, #30]
michael@0 216 STRH r3, [r0, #46]
michael@0 217 STRH r3, [r0, #62]
michael@0 218 STRH r3, [r0, #78]
michael@0 219 STRH r3, [r0, #94]
michael@0 220 STRH r3, [r0, #110]
michael@0 221 MOV PC,R14
michael@0 222 ENDP
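; idct1core_arm handles a row whose only non-zero coefficient is x[0]: every
; output of the 1-D transform is then the same value,
;   y[i] = OC_C4S4*x[0]>>16,  i = 0..7,
; where the constant built above is 0xB505 = 46341 ~= cos(pi/4)*2^16
; (46341/65536 ~= 0.70711).  The stores step by 16 bytes (8 halfwords), so
; the result is written down a column of the destination: the row pass
; stores its output transposed, which lets the column pass reuse the same
; row-oriented cores.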
michael@0 223
michael@0 224 idct2core_arm PROC
michael@0 225 ; r0 = ogg_int16_t *_y (destination)
michael@0 226 ; r1 = const ogg_int16_t *_x (source)
michael@0 227 LDRSH r9, [r1], #16 ; r9 = x[0]
michael@0 228 LDR r12,OC_C4S4
michael@0 229 LDRSH r11,[r1, #-14] ; r11= x[1]
michael@0 230 LDR r3, OC_C7S1
michael@0 231 MUL r9, r12,r9 ; r9 = t[0]<<16 = OC_C4S4*x[0]
michael@0 232 LDR r10,OC_C1S7
michael@0 233 MUL r3, r11,r3 ; r3 = t[4]<<16 = OC_C7S1*x[1]
michael@0 234 MOV r9, r9, ASR #16 ; r9 = t[0]
michael@0 235 MUL r11,r10,r11 ; r11= t[7]<<16 = OC_C1S7*x[1]
michael@0 236 MOV r3, r3, ASR #16 ; r3 = t[4]
michael@0 237 MUL r10,r12,r3 ; r10= t[5]<<16 = OC_C4S4*t[4]
michael@0 238 MOV r11,r11,ASR #16 ; r11= t[7]
michael@0 239 MUL r12,r11,r12 ; r12= t[6]<<16 = OC_C4S4*t[7]
michael@0 240 MOV r10,r10,ASR #16 ; r10= t[5]
michael@0 241 ADD r12,r9,r12,ASR #16 ; r12= t[0]+t[6]
michael@0 242 ADD r12,r12,r10 ; r12= t[0]+t2[6] = t[0]+t[6]+t[5]
michael@0 243 SUB r10,r12,r10,LSL #1 ; r10= t[0]+t2[5] = t[0]+t[6]-t[5]
michael@0 244 ADD r3, r3, r9 ; r3 = t[0]+t[4]
michael@0 245 ADD r11,r11,r9 ; r11= t[0]+t[7]
michael@0 246 STRH r11,[r0], #2 ; y[0] = t[0]+t[7]
michael@0 247 STRH r12,[r0, #14] ; y[1] = t[0]+t[6]
michael@0 248 STRH r10,[r0, #30] ; y[2] = t[0]+t[5]
michael@0 249 STRH r3, [r0, #46] ; y[3] = t[0]+t[4]
michael@0 250 RSB r3, r3, r9, LSL #1 ; r3 = t[0]*2-(t[0]+t[4])=t[0]-t[4]
michael@0 251 RSB r10,r10,r9, LSL #1 ; r10= t[0]*2-(t[0]+t[5])=t[0]-t[5]
michael@0 252 RSB r12,r12,r9, LSL #1 ; r12= t[0]*2-(t[0]+t[6])=t[0]-t[6]
michael@0 253 RSB r11,r11,r9, LSL #1 ; r11= t[0]*2-(t[0]+t[7])=t[0]-t[7]
michael@0 254 STRH r3, [r0, #62] ; y[4] = t[0]-t[4]
michael@0 255 STRH r10,[r0, #78] ; y[5] = t[0]-t[5]
michael@0 256 STRH r12,[r0, #94] ; y[6] = t[0]-t[6]
michael@0 257 STRH r11,[r0, #110] ; y[7] = t[0]-t[7]
michael@0 258 MOV PC,r14
michael@0 259 ENDP
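; With only x[0] and x[1] non-zero the even half of the transform collapses:
; t[2] = t[3] = 0, so every even-numbered intermediate equals
; t[0] = OC_C4S4*x[0]>>16.  The odd half is
;   t[4] = OC_C7S1*x[1]>>16,   t[7] = OC_C1S7*x[1]>>16,
;   t[5] = OC_C4S4*t[4]>>16,   t[6] = OC_C4S4*t[7]>>16,
; and the outputs reduce to t[0] +/- t[7], t[0] +/- (t[6]+t[5]),
; t[0] +/- (t[6]-t[5]) and t[0] +/- t[4], which is exactly the ADD/RSB
; pattern above.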
michael@0 260
michael@0 261 idct2core_down_arm PROC
michael@0 262 ; r0 = ogg_int16_t *_y (destination)
michael@0 263 ; r1 = const ogg_int16_t *_x (source)
michael@0 264 LDRSH r9, [r1], #16 ; r9 = x[0]
michael@0 265 LDR r12,OC_C4S4
michael@0 266 LDRSH r11,[r1, #-14] ; r11= x[1]
michael@0 267 LDR r3, OC_C7S1
michael@0 268 MUL r9, r12,r9 ; r9 = t[0]<<16 = OC_C4S4*x[0]
michael@0 269 LDR r10,OC_C1S7
michael@0 270 MUL r3, r11,r3 ; r3 = t[4]<<16 = OC_C7S1*x[1]
michael@0 271 MOV r9, r9, ASR #16 ; r9 = t[0]
michael@0 272 MUL r11,r10,r11 ; r11= t[7]<<16 = OC_C1S7*x[1]
michael@0 273 ADD r9, r9, #8 ; r9 = t[0]+8
michael@0 274 MOV r3, r3, ASR #16 ; r3 = t[4]
michael@0 275 MUL r10,r12,r3 ; r10= t[5]<<16 = OC_C4S4*t[4]
michael@0 276 MOV r11,r11,ASR #16 ; r11= t[7]
michael@0 277 MUL r12,r11,r12 ; r12= t[6]<<16 = OC_C4S4*t[7]
michael@0 278 MOV r10,r10,ASR #16 ; r10= t[5]
michael@0 279 ADD r12,r9,r12,ASR #16 ; r12= t[0]+t[6]+8
michael@0 280 ADD r12,r12,r10 ; r12= t[0]+t2[6] = t[0]+t[6]+t[5]+8
michael@0 281 SUB r10,r12,r10,LSL #1 ; r10= t[0]+t2[5] = t[0]+t[6]-t[5]+8
michael@0 282 ADD r3, r3, r9 ; r3 = t[0]+t[4]+8
michael@0 283 ADD r11,r11,r9 ; r11= t[0]+t[7]+8
michael@0 284 ; TODO: This is wrong.
michael@0 285 ; The C code truncates to 16 bits by storing to RAM and doing the
michael@0 286 ; shifts later; we've got an extra 4 bits here.
michael@0 287 MOV r4, r11,ASR #4
michael@0 288 MOV r5, r12,ASR #4
michael@0 289 MOV r6, r10,ASR #4
michael@0 290 MOV r7, r3, ASR #4
michael@0 291 RSB r3, r3, r9, LSL #1 ;r3 =t[0]*2+8-(t[0]+t[4])=t[0]-t[4]+8
michael@0 292 RSB r10,r10,r9, LSL #1 ;r10=t[0]*2+8-(t[0]+t[5])=t[0]-t[5]+8
michael@0 293 RSB r12,r12,r9, LSL #1 ;r12=t[0]*2+8-(t[0]+t[6])=t[0]-t[6]+8
michael@0 294 RSB r11,r11,r9, LSL #1 ;r11=t[0]*2+8-(t[0]+t[7])=t[0]-t[7]+8
michael@0 295 MOV r3, r3, ASR #4
michael@0 296 MOV r10,r10,ASR #4
michael@0 297 MOV r12,r12,ASR #4
michael@0 298 MOV r11,r11,ASR #4
michael@0 299 STRH r4, [r0], #2 ; y[0] = t[0]+t[7]
michael@0 300 STRH r5, [r0, #14] ; y[1] = t[0]+t[6]
michael@0 301 STRH r6, [r0, #30] ; y[2] = t[0]+t[5]
michael@0 302 STRH r7, [r0, #46] ; y[3] = t[0]+t[4]
michael@0 303 STRH r3, [r0, #62] ; y[4] = t[0]-t[4]
michael@0 304 STRH r10,[r0, #78] ; y[5] = t[0]-t[5]
michael@0 305 STRH r12,[r0, #94] ; y[6] = t[0]-t[6]
michael@0 306 STRH r11,[r0, #110] ; y[7] = t[0]-t[7]
michael@0 307 MOV PC,r14
michael@0 308 ENDP
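; The *_down cores are the column-pass versions of the same transforms: they
; add the rounding bias up front and arithmetic-shift every result right by 4
; before storing, i.e. y = t+8>>4, the final scaling step of the iDCT.  As
; the TODO above notes, the C reference truncates each intermediate to 16
; bits by storing it to RAM between the two passes, so keeping the wider
; values in registers here can differ from the reference in the low bits for
; extreme inputs.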
michael@0 309
michael@0 310 idct3core_arm PROC
michael@0 311 LDRSH r9, [r1], #16 ; r9 = x[0]
michael@0 312 LDR r12,OC_C4S4 ; r12= OC_C4S4
michael@0 313 LDRSH r3, [r1, #-12] ; r3 = x[2]
michael@0 314 LDR r10,OC_C6S2 ; r10= OC_C6S2
michael@0 315 MUL r9, r12,r9 ; r9 = t[0]<<16 = OC_C4S4*x[0]
michael@0 316 LDR r4, OC_C2S6 ; r4 = OC_C2S6
michael@0 317 MUL r10,r3, r10 ; r10= t[2]<<16 = OC_C6S2*x[2]
michael@0 318 LDRSH r11,[r1, #-14] ; r11= x[1]
michael@0 319 MUL r3, r4, r3 ; r3 = t[3]<<16 = OC_C2S6*x[2]
michael@0 320 LDR r4, OC_C7S1 ; r4 = OC_C7S1
michael@0 321 LDR r5, OC_C1S7 ; r5 = OC_C1S7
michael@0 322 MOV r9, r9, ASR #16 ; r9 = t[0]
michael@0 323 MUL r4, r11,r4 ; r4 = t[4]<<16 = OC_C7S1*x[1]
michael@0 324 ADD r3, r9, r3, ASR #16 ; r3 = t[0]+t[3]
michael@0 325 MUL r11,r5, r11 ; r11= t[7]<<16 = OC_C1S7*x[1]
michael@0 326 MOV r4, r4, ASR #16 ; r4 = t[4]
michael@0 327 MUL r5, r12,r4 ; r5 = t[5]<<16 = OC_C4S4*t[4]
michael@0 328 MOV r11,r11,ASR #16 ; r11= t[7]
michael@0 329 MUL r12,r11,r12 ; r12= t[6]<<16 = OC_C4S4*t[7]
michael@0 330 ADD r10,r9, r10,ASR #16 ; r10= t[1] = t[0]+t[2]
michael@0 331 RSB r6, r10,r9, LSL #1 ; r6 = t[2] = t[0]-t[2]
michael@0 332 ; r3 = t2[0] = t[0]+t[3]
michael@0 333 RSB r9, r3, r9, LSL #1 ; r9 = t2[3] = t[0]-t[3]
michael@0 334 MOV r12,r12,ASR #16 ; r12= t[6]
michael@0 335 ADD r5, r12,r5, ASR #16 ; r5 = t2[6] = t[6]+t[5]
michael@0 336 RSB r12,r5, r12,LSL #1 ; r12= t2[5] = t[6]-t[5]
michael@0 337 ADD r11,r3, r11 ; r11= t2[0]+t[7]
michael@0 338 ADD r5, r10,r5 ; r5 = t[1]+t2[6]
michael@0 339 ADD r12,r6, r12 ; r12= t[2]+t2[5]
michael@0 340 ADD r4, r9, r4 ; r4 = t2[3]+t[4]
michael@0 341 STRH r11,[r0], #2 ; y[0] = t[0]+t[7]
michael@0 342 STRH r5, [r0, #14] ; y[1] = t[1]+t2[6]
michael@0 343 STRH r12,[r0, #30] ; y[2] = t[2]+t2[5]
michael@0 344 STRH r4, [r0, #46] ; y[3] = t2[3]+t[4]
michael@0 345 RSB r11,r11,r3, LSL #1 ; r11= t2[0] - t[7]
michael@0 346 RSB r5, r5, r10,LSL #1 ; r5 = t[1] - t2[6]
michael@0 347 RSB r12,r12,r6, LSL #1 ; r12= t[2] - t2[5]
michael@0 348 RSB r4, r4, r9, LSL #1 ; r4 = t2[3] - t[4]
michael@0 349 STRH r4, [r0, #62] ; y[4] = t2[3]-t[4]
michael@0 350 STRH r12,[r0, #78] ; y[5] = t[2]-t2[5]
michael@0 351 STRH r5, [r0, #94] ; y[6] = t[1]-t2[6]
michael@0 352 STRH r11,[r0, #110] ; y[7] = t2[0]-t[7]
michael@0 353 MOV PC,R14
michael@0 354 ENDP
michael@0 355
michael@0 356 idct3core_down_arm PROC
michael@0 357 LDRSH r9, [r1], #16 ; r9 = x[0]
michael@0 358 LDR r12,OC_C4S4 ; r12= OC_C4S4
michael@0 359 LDRSH r3, [r1, #-12] ; r3 = x[2]
michael@0 360 LDR r10,OC_C6S2 ; r10= OC_C6S2
michael@0 361 MUL r9, r12,r9 ; r9 = t[0]<<16 = OC_C4S4*x[0]
michael@0 362 LDR r4, OC_C2S6 ; r4 = OC_C2S6
michael@0 363 MUL r10,r3, r10 ; r10= t[2]<<16 = OC_C6S2*x[2]
michael@0 364 LDRSH r11,[r1, #-14] ; r11= x[1]
michael@0 365 MUL r3, r4, r3 ; r3 = t[3]<<16 = OC_C2S6*x[2]
michael@0 366 LDR r4, OC_C7S1 ; r4 = OC_C7S1
michael@0 367 LDR r5, OC_C1S7 ; r5 = OC_C1S7
michael@0 368 MOV r9, r9, ASR #16 ; r9 = t[0]
michael@0 369 MUL r4, r11,r4 ; r4 = t[4]<<16 = OC_C7S1*x[1]
michael@0 370 ADD r9, r9, #8 ; r9 = t[0]+8
michael@0 371 MUL r11,r5, r11 ; r11= t[7]<<16 = OC_C1S7*x[1]
michael@0 372 ADD r3, r9, r3, ASR #16 ; r3 = t[0]+t[3]+8
michael@0 373 MOV r4, r4, ASR #16 ; r4 = t[4]
michael@0 374 MUL r5, r12,r4 ; r5 = t[5]<<16 = OC_C4S4*t[4]
michael@0 375 MOV r11,r11,ASR #16 ; r11= t[7]
michael@0 376 MUL r12,r11,r12 ; r12= t[6]<<16 = OC_C4S4*t[7]
michael@0 377 ADD r10,r9, r10,ASR #16 ; r10= t[1]+8 = t[0]+t[2]+8
michael@0 378 RSB r6, r10,r9, LSL #1 ; r6 = t[2]+8 = t[0]-t[2]+8
michael@0 379 ; r3 = t2[0]+8 = t[0]+t[3]+8
michael@0 380 RSB r9, r3, r9, LSL #1 ; r9 = t2[3]+8 = t[0]-t[3]+8
michael@0 381 MOV r12,r12,ASR #16 ; r12= t[6]
michael@0 382 ADD r5, r12,r5, ASR #16 ; r5 = t2[6] = t[6]+t[5]
michael@0 383 RSB r12,r5, r12,LSL #1 ; r12= t2[5] = t[6]-t[5]
michael@0 384 ADD r11,r3, r11 ; r11= t2[0]+t[7] +8
michael@0 385 ADD r5, r10,r5 ; r5 = t[1] +t2[6]+8
michael@0 386 ADD r12,r6, r12 ; r12= t[2] +t2[5]+8
michael@0 387 ADD r4, r9, r4 ; r4 = t2[3]+t[4] +8
michael@0 388 RSB r3, r11,r3, LSL #1 ; r3 = t2[0] - t[7] + 8
michael@0 389 RSB r10,r5, r10,LSL #1 ; r10= t[1] - t2[6] + 8
michael@0 390 RSB r6, r12,r6, LSL #1 ; r6 = t[2] - t2[5] + 8
michael@0 391 RSB r9, r4, r9, LSL #1 ; r9 = t2[3] - t[4] + 8
michael@0 392 ; TODO: This is wrong.
michael@0 393 ; The C code truncates to 16 bits by storing to RAM and doing the
michael@0 394 ; shifts later; we've got an extra 4 bits here.
michael@0 395 MOV r11,r11,ASR #4
michael@0 396 MOV r5, r5, ASR #4
michael@0 397 MOV r12,r12,ASR #4
michael@0 398 MOV r4, r4, ASR #4
michael@0 399 MOV r9, r9, ASR #4
michael@0 400 MOV r6, r6, ASR #4
michael@0 401 MOV r10,r10,ASR #4
michael@0 402 MOV r3, r3, ASR #4
michael@0 403 STRH r11,[r0], #2 ; y[0] = t[0]+t[7]
michael@0 404 STRH r5, [r0, #14] ; y[1] = t[1]+t2[6]
michael@0 405 STRH r12,[r0, #30] ; y[2] = t[2]+t2[5]
michael@0 406 STRH r4, [r0, #46] ; y[3] = t2[3]+t[4]
michael@0 407 STRH r9, [r0, #62] ; y[4] = t2[3]-t[4]
michael@0 408 STRH r6, [r0, #78] ; y[5] = t[2]-t2[5]
michael@0 409 STRH r10,[r0, #94] ; y[6] = t[1]-t2[6]
michael@0 410 STRH r3, [r0, #110] ; y[7] = t2[0]-t[7]
michael@0 411 MOV PC,R14
michael@0 412 ENDP
michael@0 413
michael@0 414 idct4core_arm PROC
michael@0 415 ; r0 = ogg_int16_t *_y (destination)
michael@0 416 ; r1 = const ogg_int16_t *_x (source)
michael@0 417 LDRSH r9, [r1], #16 ; r9 = x[0]
michael@0 418 LDR r10,OC_C4S4 ; r10= OC_C4S4
michael@0 419 LDRSH r12,[r1, #-12] ; r12= x[2]
michael@0 420 LDR r4, OC_C6S2 ; r4 = OC_C6S2
michael@0 421 MUL r9, r10,r9 ; r9 = t[0]<<16 = OC_C4S4*x[0]
michael@0 422 LDR r5, OC_C2S6 ; r5 = OC_C2S6
michael@0 423 MUL r4, r12,r4 ; r4 = t[2]<<16 = OC_C6S2*x[2]
michael@0 424 LDRSH r3, [r1, #-14] ; r3 = x[1]
michael@0 425 MUL r5, r12,r5 ; r5 = t[3]<<16 = OC_C2S6*x[2]
michael@0 426 LDR r6, OC_C7S1 ; r6 = OC_C7S1
michael@0 427 LDR r12,OC_C1S7 ; r12= OC_C1S7
michael@0 428 LDRSH r11,[r1, #-10] ; r11= x[3]
michael@0 429 MUL r6, r3, r6 ; r6 = t[4]<<16 = OC_C7S1*x[1]
michael@0 430 LDR r7, OC_C5S3 ; r7 = OC_C5S3
michael@0 431 MUL r3, r12,r3 ; r3 = t[7]<<16 = OC_C1S7*x[1]
michael@0 432 LDR r8, OC_C3S5 ; r8 = OC_C3S5
michael@0 433 MUL r7, r11,r7 ; r7 = -t[5]<<16 = OC_C5S3*x[3]
michael@0 434 MOV r9, r9, ASR #16 ; r9 = t[0]
michael@0 435 MUL r11,r8, r11 ; r11= t[6]<<16 = OC_C3S5*x[3]
michael@0 436 MOV r6, r6, ASR #16 ; r6 = t[4]
michael@0 437 ; TODO: This is wrong; t[4]-t[5] and t[7]-t[6] need to be truncated to 16-bit
michael@0 438 ; before multiplying, not after (this is not equivalent)
michael@0 439 SUB r7, r6, r7, ASR #16 ; r7 = t2[4]=t[4]+t[5] (as r7=-t[5])
michael@0 440 RSB r6, r7, r6, LSL #1 ; r6 = t[4]-t[5]
michael@0 441 MUL r6, r10,r6 ; r6 = t2[5]<<16 =OC_C4S4*(t[4]-t[5])
michael@0 442 MOV r3, r3, ASR #16 ; r3 = t[7]
michael@0 443 ADD r11,r3, r11,ASR #16 ; r11= t2[7]=t[7]+t[6]
michael@0 444 RSB r3, r11,r3, LSL #1 ; r3 = t[7]-t[6]
michael@0 445 MUL r3, r10,r3 ; r3 = t2[6]<<16 =OC_C4S4*(t[7]-t[6])
michael@0 446 ADD r4, r9, r4, ASR #16 ; r4 = t[1] = t[0] + t[2]
michael@0 447 RSB r10,r4, r9, LSL #1 ; r10= t[2] = t[0] - t[2]
michael@0 448 ADD r5, r9, r5, ASR #16 ; r5 = t[0] = t[0] + t[3]
michael@0 449 RSB r9, r5, r9, LSL #1 ; r9 = t[3] = t[0] - t[3]
michael@0 450 MOV r3, r3, ASR #16 ; r3 = t2[6]
michael@0 451 ADD r6, r3, r6, ASR #16 ; r6 = t3[6] = t2[6]+t2[5]
michael@0 452 RSB r3, r6, r3, LSL #1 ; r3 = t3[5] = t2[6]-t2[5]
michael@0 453 ADD r11,r5, r11 ; r11= t[0]+t2[7]
michael@0 454 ADD r6, r4, r6 ; r6 = t[1]+t3[6]
michael@0 455 ADD r3, r10,r3 ; r3 = t[2]+t3[5]
michael@0 456 ADD r7, r9, r7 ; r7 = t[3]+t2[4]
michael@0 457 STRH r11,[r0], #2 ; y[0] = t[0]+t2[7]
michael@0 458 STRH r6, [r0, #14] ; y[1] = t[1]+t3[6]
michael@0 459 STRH r3, [r0, #30] ; y[2] = t[2]+t3[5]
michael@0 460 STRH r7, [r0, #46] ; y[3] = t[3]+t2[4]
michael@0 461 RSB r11,r11,r5, LSL #1 ; r11= t[0]-t2[7]
michael@0 462 RSB r6, r6, r4, LSL #1 ; r6 = t[1]-t3[6]
michael@0 463 RSB r3, r3, r10,LSL #1 ; r3 = t[2]-t3[5]
michael@0 464 RSB r7, r7, r9, LSL #1 ; r7 = t[3]-t2[4]
michael@0 465 STRH r7, [r0, #62] ; y[4] = t[3]-t2[4]
michael@0 466 STRH r3, [r0, #78] ; y[5] = t[2]-t3[5]
michael@0 467 STRH r6, [r0, #94] ; y[6] = t[1]-t3[6]
michael@0 468 STRH r11,[r0, #110] ; y[7] = t[0]-t2[7]
michael@0 469 MOV PC,r14
michael@0 470 ENDP
michael@0 471
michael@0 472 idct4core_down_arm PROC
michael@0 473 ; r0 = ogg_int16_t *_y (destination)
michael@0 474 ; r1 = const ogg_int16_t *_x (source)
michael@0 475 LDRSH r9, [r1], #16 ; r9 = x[0]
michael@0 476 LDR r10,OC_C4S4 ; r10= OC_C4S4
michael@0 477 LDRSH r12,[r1, #-12] ; r12= x[2]
michael@0 478 LDR r4, OC_C6S2 ; r4 = OC_C6S2
michael@0 479 MUL r9, r10,r9 ; r9 = t[0]<<16 = OC_C4S4*x[0]
michael@0 480 LDR r5, OC_C2S6 ; r5 = OC_C2S6
michael@0 481 MUL r4, r12,r4 ; r4 = t[2]<<16 = OC_C6S2*x[2]
michael@0 482 LDRSH r3, [r1, #-14] ; r3 = x[1]
michael@0 483 MUL r5, r12,r5 ; r5 = t[3]<<16 = OC_C2S6*x[2]
michael@0 484 LDR r6, OC_C7S1 ; r6 = OC_C7S1
michael@0 485 LDR r12,OC_C1S7 ; r12= OC_C1S7
michael@0 486 LDRSH r11,[r1, #-10] ; r11= x[3]
michael@0 487 MUL r6, r3, r6 ; r6 = t[4]<<16 = OC_C7S1*x[1]
michael@0 488 LDR r7, OC_C5S3 ; r7 = OC_C5S3
michael@0 489 MUL r3, r12,r3 ; r3 = t[7]<<16 = OC_C1S7*x[1]
michael@0 490 LDR r8, OC_C3S5 ; r8 = OC_C3S5
michael@0 491 MUL r7, r11,r7 ; r7 = -t[5]<<16 = OC_C5S3*x[3]
michael@0 492 MOV r9, r9, ASR #16 ; r9 = t[0]
michael@0 493 MUL r11,r8, r11 ; r11= t[6]<<16 = OC_C3S5*x[3]
michael@0 494 MOV r6, r6, ASR #16 ; r6 = t[4]
michael@0 495 ; TODO: This is wrong; t[4]-t[5] and t[7]-t[6] need to be truncated to 16-bit
michael@0 496 ; before multiplying, not after (this is not equivalent)
michael@0 497 SUB r7, r6, r7, ASR #16 ; r7 = t2[4]=t[4]+t[5] (as r7=-t[5])
michael@0 498 RSB r6, r7, r6, LSL #1 ; r6 = t[4]-t[5]
michael@0 499 MUL r6, r10,r6 ; r6 = t2[5]<<16 =OC_C4S4*(t[4]-t[5])
michael@0 500 MOV r3, r3, ASR #16 ; r3 = t[7]
michael@0 501 ADD r11,r3, r11,ASR #16 ; r11= t2[7]=t[7]+t[6]
michael@0 502 RSB r3, r11,r3, LSL #1 ; r3 = t[7]-t[6]
michael@0 503 ADD r9, r9, #8 ; r9 = t[0]+8
michael@0 504 MUL r3, r10,r3 ; r3 = t2[6]<<16 =OC_C4S4*(t[7]-t[6])
michael@0 505 ADD r4, r9, r4, ASR #16 ; r4 = t[1] = t[0] + t[2] + 8
michael@0 506 RSB r10,r4, r9, LSL #1 ; r10= t[2] = t[0] - t[2] + 8
michael@0 507 ADD r5, r9, r5, ASR #16 ; r5 = t[0] = t[0] + t[3] + 8
michael@0 508 RSB r9, r5, r9, LSL #1 ; r9 = t[3] = t[0] - t[3] + 8
michael@0 509 MOV r3, r3, ASR #16 ; r3 = t2[6]
michael@0 510 ADD r6, r3, r6, ASR #16 ; r6 = t3[6] = t2[6]+t2[5]
michael@0 511 RSB r3, r6, r3, LSL #1 ; r3 = t3[5] = t2[6]-t2[5]
michael@0 512 ADD r5, r5, r11 ; r5 = t[0]+t2[7]+8
michael@0 513 ADD r4, r4, r6 ; r4 = t[1]+t3[6]+8
michael@0 514 ADD r10,r10,r3 ; r10= t[2]+t3[5]+8
michael@0 515 ADD r9, r9, r7 ; r9 = t[3]+t2[4]+8
michael@0 516 SUB r11,r5, r11,LSL #1 ; r11= t[0]-t2[7]+8
michael@0 517 SUB r6, r4, r6, LSL #1 ; r6 = t[1]-t3[6]+8
michael@0 518 SUB r3, r10,r3, LSL #1 ; r3 = t[2]-t3[5]+8
michael@0 519 SUB r7, r9, r7, LSL #1 ; r7 = t[3]-t2[4]+8
michael@0 520 ; TODO: This is wrong.
michael@0 521 ; The C code truncates to 16 bits by storing to RAM and doing the
michael@0 522 ; shifts later; we've got an extra 4 bits here.
michael@0 523 MOV r11,r11,ASR #4
michael@0 524 MOV r6, r6, ASR #4
michael@0 525 MOV r3, r3, ASR #4
michael@0 526 MOV r7, r7, ASR #4
michael@0 527 MOV r9, r9, ASR #4
michael@0 528 MOV r10,r10,ASR #4
michael@0 529 MOV r4, r4, ASR #4
michael@0 530 MOV r5, r5, ASR #4
michael@0 531 STRH r5, [r0], #2 ; y[0] = t[0]+t2[7]
michael@0 532 STRH r4, [r0, #14] ; y[1] = t[1]+t3[6]
michael@0 533 STRH r10,[r0, #30] ; y[2] = t[2]+t3[5]
michael@0 534 STRH r9, [r0, #46] ; y[3] = t[3]+t2[4]
michael@0 535 STRH r7, [r0, #62] ; y[4] = t[3]-t2[4]
michael@0 536 STRH r3, [r0, #78] ; y[5] = t[2]-t3[5]
michael@0 537 STRH r6, [r0, #94] ; y[6] = t[1]-t3[6]
michael@0 538 STRH r11,[r0, #110] ; y[7] = t[0]-t2[7]
michael@0 539 MOV PC,r14
michael@0 540 ENDP
michael@0 541
michael@0 542 idct8core_arm PROC
michael@0 543 ; r0 = ogg_int16_t *_y (destination)
michael@0 544 ; r1 = const ogg_int16_t *_x (source)
michael@0 545 LDRSH r2, [r1],#16 ; r2 = x[0]
michael@0 546 STMFD r13!,{r1,r14}
michael@0 547 LDRSH r6, [r1, #-8] ; r6 = x[4]
michael@0 548 LDR r12,OC_C4S4 ; r12= C4S4
michael@0 549 LDRSH r4, [r1, #-12] ; r4 = x[2]
michael@0 550 ADD r2, r2, r6 ; r2 = x[0] + x[4]
michael@0 551 SUB r6, r2, r6, LSL #1 ; r6 = x[0] - x[4]
michael@0 552 ; For spec compliance, these sums must be truncated to 16-bit precision
michael@0 553 ; _before_ the multiply (not after).
michael@0 554 ; Sadly, ARMv4 provides no simple way to do that.
michael@0 555 MOV r2, r2, LSL #16
michael@0 556 MOV r6, r6, LSL #16
michael@0 557 MOV r2, r2, ASR #16
michael@0 558 MOV r6, r6, ASR #16
michael@0 559 MUL r2, r12,r2 ; r2 = t[0]<<16 = C4S4*(x[0]+x[4])
michael@0 560 LDRSH r8, [r1, #-4] ; r8 = x[6]
michael@0 561 LDR r7, OC_C6S2 ; r7 = OC_C6S2
michael@0 562 MUL r6, r12,r6 ; r6 = t[1]<<16 = C4S4*(x[0]-x[4])
michael@0 563 LDR r14,OC_C2S6 ; r14= OC_C2S6
michael@0 564 MUL r3, r4, r7 ; r3 = OC_C6S2*x[2]
michael@0 565 LDR r5, OC_C7S1 ; r5 = OC_C7S1
michael@0 566 MUL r4, r14,r4 ; r4 = OC_C2S6*x[2]
michael@0 567 MOV r3, r3, ASR #16 ; r3 = OC_C6S2*x[2]>>16
michael@0 568 MUL r14,r8, r14 ; r14= OC_C2S6*x[6]
michael@0 569 MOV r4, r4, ASR #16 ; r4 = OC_C2S6*x[2]>>16
michael@0 570 MUL r8, r7, r8 ; r8 = OC_C6S2*x[6]
michael@0 571 LDR r7, OC_C1S7 ; r7 = OC_C1S7
michael@0 572 SUB r3, r3, r14,ASR #16 ; r3=t[2]=C6S2*x[2]>>16-C2S6*x[6]>>16
michael@0 573 LDRSH r14,[r1, #-14] ; r14= x[1]
michael@0 574 ADD r4, r4, r8, ASR #16 ; r4=t[3]=C2S6*x[2]>>16+C6S2*x[6]>>16
michael@0 575 LDRSH r8, [r1, #-2] ; r8 = x[7]
michael@0 576 MUL r9, r5, r14 ; r9 = OC_C7S1*x[1]
michael@0 577 LDRSH r10,[r1, #-6] ; r10= x[5]
michael@0 578 MUL r14,r7, r14 ; r14= OC_C1S7*x[1]
michael@0 579 MOV r9, r9, ASR #16 ; r9 = OC_C7S1*x[1]>>16
michael@0 580 MUL r7, r8, r7 ; r7 = OC_C1S7*x[7]
michael@0 581 MOV r14,r14,ASR #16 ; r14= OC_C1S7*x[1]>>16
michael@0 582 MUL r8, r5, r8 ; r8 = OC_C7S1*x[7]
michael@0 583 LDRSH r1, [r1, #-10] ; r1 = x[3]
michael@0 584 LDR r5, OC_C3S5 ; r5 = OC_C3S5
michael@0 585 LDR r11,OC_C5S3 ; r11= OC_C5S3
michael@0 586 ADD r8, r14,r8, ASR #16 ; r8=t[7]=C1S7*x[1]>>16+C7S1*x[7]>>16
michael@0 587 MUL r14,r5, r10 ; r14= OC_C3S5*x[5]
michael@0 588 SUB r9, r9, r7, ASR #16 ; r9=t[4]=C7S1*x[1]>>16-C1S7*x[7]>>16
michael@0 589 MUL r10,r11,r10 ; r10= OC_C5S3*x[5]
michael@0 590 MOV r14,r14,ASR #16 ; r14= OC_C3S5*x[5]>>16
michael@0 591 MUL r11,r1, r11 ; r11= OC_C5S3*x[3]
michael@0 592 MOV r10,r10,ASR #16 ; r10= OC_C5S3*x[5]>>16
michael@0 593 MUL r1, r5, r1 ; r1 = OC_C3S5*x[3]
michael@0 594 SUB r14,r14,r11,ASR #16 ;r14=t[5]=C3S5*x[5]>>16-C5S3*x[3]>>16
michael@0 595 ADD r10,r10,r1, ASR #16 ;r10=t[6]=C5S3*x[5]>>16+C3S5*x[3]>>16
michael@0 596 ; r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t[7] r9=t[4]
michael@0 597 ; r10=t[6] r12=C4S4 r14=t[5]
michael@0 598 ; TODO: This is wrong; t[4]-t[5] and t[7]-t[6] need to be truncated to 16-bit
michael@0 599 ; before multiplying, not after (this is not equivalent)
michael@0 600 ; Stage 2
michael@0 601 ; 4-5 butterfly
michael@0 602 ADD r9, r9, r14 ; r9 = t2[4] = t[4]+t[5]
michael@0 603 SUB r14,r9, r14, LSL #1 ; r14= t[4]-t[5]
michael@0 604 MUL r14,r12,r14 ; r14= t2[5]<<16 = C4S4*(t[4]-t[5])
michael@0 605 ; 7-6 butterfly
michael@0 606 ADD r8, r8, r10 ; r8 = t2[7] = t[7]+t[6]
michael@0 607 SUB r10,r8, r10, LSL #1 ; r10= t[7]-t[6]
michael@0 608 MUL r10,r12,r10 ; r10= t2[6]<<16 = C4S4*(t[7]-t[6])
michael@0 609 ; r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t2[7] r9=t2[4]
michael@0 610 ; r10=t2[6]<<16 r12=C4S4 r14=t2[5]<<16
michael@0 611 ; Stage 3
michael@0 612 ; 0-3 butterfly
michael@0 613 ADD r2, r4, r2, ASR #16 ; r2 = t2[0] = t[0] + t[3]
michael@0 614 SUB r4, r2, r4, LSL #1 ; r4 = t2[3] = t[0] - t[3]
michael@0 615 ; 1-2 butterfly
michael@0 616 ADD r6, r3, r6, ASR #16 ; r6 = t2[1] = t[1] + t[2]
michael@0 617 SUB r3, r6, r3, LSL #1 ; r3 = t2[2] = t[1] - t[2]
michael@0 618 ; 6-5 butterfly
michael@0 619 MOV r14,r14,ASR #16 ; r14= t2[5]
michael@0 620 ADD r10,r14,r10,ASR #16 ; r10= t3[6] = t[6] + t[5]
michael@0 621 SUB r14,r10,r14,LSL #1 ; r14= t3[5] = t[6] - t[5]
michael@0 622 ; r2=t2[0] r3=t2[2] r4=t2[3] r6=t2[1] r8=t2[7] r9=t2[4]
michael@0 623 ; r10=t3[6] r14=t3[5]
michael@0 624 ; Stage 4
michael@0 625 ADD r2, r2, r8 ; r2 = t[0] + t[7]
michael@0 626 ADD r6, r6, r10 ; r6 = t[1] + t[6]
michael@0 627 ADD r3, r3, r14 ; r3 = t[2] + t[5]
michael@0 628 ADD r4, r4, r9 ; r4 = t[3] + t[4]
michael@0 629 SUB r8, r2, r8, LSL #1 ; r8 = t[0] - t[7]
michael@0 630 SUB r10,r6, r10,LSL #1 ; r10= t[1] - t[6]
michael@0 631 SUB r14,r3, r14,LSL #1 ; r14= t[2] - t[5]
michael@0 632 SUB r9, r4, r9, LSL #1 ; r9 = t[3] - t[4]
michael@0 633 STRH r2, [r0], #2 ; y[0] = t[0]+t[7]
michael@0 634 STRH r6, [r0, #14] ; y[1] = t[1]+t[6]
michael@0 635 STRH r3, [r0, #30] ; y[2] = t[2]+t[5]
michael@0 636 STRH r4, [r0, #46] ; y[3] = t[3]+t[4]
michael@0 637 STRH r9, [r0, #62] ; y[4] = t[3]-t[4]
michael@0 638 STRH r14,[r0, #78] ; y[5] = t[2]-t[5]
michael@0 639 STRH r10,[r0, #94] ; y[6] = t[1]-t[6]
michael@0 640 STRH r8, [r0, #110] ; y[7] = t[0]-t[7]
michael@0 641 LDMFD r13!,{r1,PC}
michael@0 642 ENDP
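; Summary of the 8-point core above (the same dataflow the per-instruction
; comments describe, collected in one place):
;   Stage 1: t[0] = C4S4*(x[0]+x[4])>>16      t[1] = C4S4*(x[0]-x[4])>>16
;            t[2] = C6S2*x[2]>>16 - C2S6*x[6]>>16
;            t[3] = C2S6*x[2]>>16 + C6S2*x[6]>>16
;            t[4] = C7S1*x[1]>>16 - C1S7*x[7]>>16
;            t[7] = C1S7*x[1]>>16 + C7S1*x[7]>>16
;            t[5] = C3S5*x[5]>>16 - C5S3*x[3]>>16
;            t[6] = C5S3*x[5]>>16 + C3S5*x[3]>>16
;   Stage 2: t2[4] = t[4]+t[5]    t2[5] = C4S4*(t[4]-t[5])>>16
;            t2[7] = t[7]+t[6]    t2[6] = C4S4*(t[7]-t[6])>>16
;   Stage 3: t2[0] = t[0]+t[3]    t2[3] = t[0]-t[3]
;            t2[1] = t[1]+t[2]    t2[2] = t[1]-t[2]
;            t3[6] = t2[6]+t2[5]  t3[5] = t2[6]-t2[5]
;   Stage 4: y[0] = t2[0]+t2[7]   y[7] = t2[0]-t2[7]
;            y[1] = t2[1]+t3[6]   y[6] = t2[1]-t3[6]
;            y[2] = t2[2]+t3[5]   y[5] = t2[2]-t3[5]
;            y[3] = t2[3]+t2[4]   y[4] = t2[3]-t2[4]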
michael@0 643
michael@0 644 idct8core_down_arm PROC
michael@0 645 ; r0 = ogg_int16_t *_y (destination)
michael@0 646 ; r1 = const ogg_int16_t *_x (source)
michael@0 647 LDRSH r2, [r1],#16 ; r2 = x[0]
michael@0 648 STMFD r13!,{r1,r14}
michael@0 649 LDRSH r6, [r1, #-8] ; r6 = x[4]
michael@0 650 LDR r12,OC_C4S4 ; r12= C4S4
michael@0 651 LDRSH r4, [r1, #-12] ; r4 = x[2]
michael@0 652 ADD r2, r2, r6 ; r2 = x[0] + x[4]
michael@0 653 SUB r6, r2, r6, LSL #1 ; r6 = x[0] - x[4]
michael@0 654 ; For spec compliance, these sums must be truncated to 16-bit precision
michael@0 655 ; _before_ the multiply (not after).
michael@0 656 ; Sadly, ARMv4 provides no simple way to do that.
michael@0 657 MOV r2, r2, LSL #16
michael@0 658 MOV r6, r6, LSL #16
michael@0 659 MOV r2, r2, ASR #16
michael@0 660 MOV r6, r6, ASR #16
michael@0 661 MUL r2, r12,r2 ; r2 = t[0]<<16 = C4S4*(x[0]+x[4])
michael@0 662 LDRSH r8, [r1, #-4] ; r8 = x[6]
michael@0 663 LDR r7, OC_C6S2 ; r7 = OC_C6S2
michael@0 664 MUL r6, r12,r6 ; r6 = t[1]<<16 = C4S4*(x[0]-x[4])
michael@0 665 LDR r14,OC_C2S6 ; r14= OC_C2S6
michael@0 666 MUL r3, r4, r7 ; r3 = OC_C6S2*x[2]
michael@0 667 LDR r5, OC_C7S1 ; r5 = OC_C7S1
michael@0 668 MUL r4, r14,r4 ; r4 = OC_C2S6*x[2]
michael@0 669 MOV r3, r3, ASR #16 ; r3 = OC_C6S2*x[2]>>16
michael@0 670 MUL r14,r8, r14 ; r14= OC_C2S6*x[6]
michael@0 671 MOV r4, r4, ASR #16 ; r4 = OC_C2S6*x[2]>>16
michael@0 672 MUL r8, r7, r8 ; r8 = OC_C6S2*x[6]
michael@0 673 LDR r7, OC_C1S7 ; r7 = OC_C1S7
michael@0 674 SUB r3, r3, r14,ASR #16 ; r3=t[2]=C6S2*x[2]>>16-C2S6*x[6]>>16
michael@0 675 LDRSH r14,[r1, #-14] ; r14= x[1]
michael@0 676 ADD r4, r4, r8, ASR #16 ; r4=t[3]=C2S6*x[2]>>16+C6S2*x[6]>>16
michael@0 677 LDRSH r8, [r1, #-2] ; r8 = x[7]
michael@0 678 MUL r9, r5, r14 ; r9 = OC_C7S1*x[1]
michael@0 679 LDRSH r10,[r1, #-6] ; r10= x[5]
michael@0 680 MUL r14,r7, r14 ; r14= OC_C1S7*x[1]
michael@0 681 MOV r9, r9, ASR #16 ; r9 = OC_C7S1*x[1]>>16
michael@0 682 MUL r7, r8, r7 ; r7 = OC_C1S7*x[7]
michael@0 683 MOV r14,r14,ASR #16 ; r14= OC_C1S7*x[1]>>16
michael@0 684 MUL r8, r5, r8 ; r8 = OC_C7S1*x[7]
michael@0 685 LDRSH r1, [r1, #-10] ; r1 = x[3]
michael@0 686 LDR r5, OC_C3S5 ; r5 = OC_C3S5
michael@0 687 LDR r11,OC_C5S3 ; r11= OC_C5S3
michael@0 688 ADD r8, r14,r8, ASR #16 ; r8=t[7]=C1S7*x[1]>>16+C7S1*x[7]>>16
michael@0 689 MUL r14,r5, r10 ; r14= OC_C3S5*x[5]
michael@0 690 SUB r9, r9, r7, ASR #16 ; r9=t[4]=C7S1*x[1]>>16-C1S7*x[7]>>16
michael@0 691 MUL r10,r11,r10 ; r10= OC_C5S3*x[5]
michael@0 692 MOV r14,r14,ASR #16 ; r14= OC_C3S5*x[5]>>16
michael@0 693 MUL r11,r1, r11 ; r11= OC_C5S3*x[3]
michael@0 694 MOV r10,r10,ASR #16 ; r10= OC_C5S3*x[5]>>16
michael@0 695 MUL r1, r5, r1 ; r1 = OC_C3S5*x[3]
michael@0 696 SUB r14,r14,r11,ASR #16 ;r14=t[5]=C3S5*x[5]>>16-C5S3*x[3]>>16
michael@0 697 ADD r10,r10,r1, ASR #16 ;r10=t[6]=C5S3*x[5]>>16+C3S5*x[3]>>16
michael@0 698 ; r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t[7] r9=t[4]
michael@0 699 ; r10=t[6] r12=C4S4 r14=t[5]
michael@0 700 ; Stage 2
michael@0 701 ; TODO: This is wrong; t[4]-t[5] and t[7]-t[6] need to be truncated to 16-bit
michael@0 702 ; before multiplying, not after (this is not equivalent)
michael@0 703 ; 4-5 butterfly
michael@0 704 ADD r9, r9, r14 ; r9 = t2[4] = t[4]+t[5]
michael@0 705 SUB r14,r9, r14, LSL #1 ; r14= t[4]-t[5]
michael@0 706 MUL r14,r12,r14 ; r14= t2[5]<<16 = C4S4*(t[4]-t[5])
michael@0 707 ; 7-6 butterfly
michael@0 708 ADD r8, r8, r10 ; r8 = t2[7] = t[7]+t[6]
michael@0 709 SUB r10,r8, r10, LSL #1 ; r10= t[7]-t[6]
michael@0 710 MUL r10,r12,r10 ; r10= t2[6]<<16 = C4S4*(t[7]-t[6])
michael@0 711 ; r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t2[7] r9=t2[4]
michael@0 712 ; r10=t2[6]<<16 r12=C4S4 r14=t2[5]<<16
michael@0 713 ; Stage 3
michael@0 714 ADD r2, r2, #8<<16 ; r2 = t[0]+8<<16
michael@0 715 ADD r6, r6, #8<<16 ; r6 = t[1]+8<<16
michael@0 716 ; 0-3 butterfly
michael@0 717 ADD r2, r4, r2, ASR #16 ; r2 = t2[0] = t[0] + t[3] + 8
michael@0 718 SUB r4, r2, r4, LSL #1 ; r4 = t2[3] = t[0] - t[3] + 8
michael@0 719 ; 1-2 butterfly
michael@0 720 ADD r6, r3, r6, ASR #16 ; r6 = t2[1] = t[1] + t[2] + 8
michael@0 721 SUB r3, r6, r3, LSL #1 ; r3 = t2[2] = t[1] - t[2] + 8
michael@0 722 ; 6-5 butterfly
michael@0 723 MOV r14,r14,ASR #16 ; r14= t2[5]
michael@0 724 ADD r10,r14,r10,ASR #16 ; r10= t3[6] = t[6] + t[5]
michael@0 725 SUB r14,r10,r14,LSL #1 ; r14= t3[5] = t[6] - t[5]
michael@0 726 ; r2=t2[0] r3=t2[2] r4=t2[3] r6=t2[1] r8=t2[7] r9=t2[4]
michael@0 727 ; r10=t3[6] r14=t3[5]
michael@0 728 ; Stage 4
michael@0 729 ADD r2, r2, r8 ; r2 = t[0] + t[7] + 8
michael@0 730 ADD r6, r6, r10 ; r6 = t[1] + t[6] + 8
michael@0 731 ADD r3, r3, r14 ; r3 = t[2] + t[5] + 8
michael@0 732 ADD r4, r4, r9 ; r4 = t[3] + t[4] + 8
michael@0 733 SUB r8, r2, r8, LSL #1 ; r8 = t[0] - t[7] + 8
michael@0 734 SUB r10,r6, r10,LSL #1 ; r10= t[1] - t[6] + 8
michael@0 735 SUB r14,r3, r14,LSL #1 ; r14= t[2] - t[5] + 8
michael@0 736 SUB r9, r4, r9, LSL #1 ; r9 = t[3] - t[4] + 8
michael@0 737 ; TODO: This is wrong.
michael@0 738 ; The C code truncates to 16 bits by storing to RAM and doing the
michael@0 739 ; shifts later; we've got an extra 4 bits here.
michael@0 740 MOV r2, r2, ASR #4
michael@0 741 MOV r6, r6, ASR #4
michael@0 742 MOV r3, r3, ASR #4
michael@0 743 MOV r4, r4, ASR #4
michael@0 744 MOV r8, r8, ASR #4
michael@0 745 MOV r10,r10,ASR #4
michael@0 746 MOV r14,r14,ASR #4
michael@0 747 MOV r9, r9, ASR #4
michael@0 748 STRH r2, [r0], #2 ; y[0] = t[0]+t[7]
michael@0 749 STRH r6, [r0, #14] ; y[1] = t[1]+t[6]
michael@0 750 STRH r3, [r0, #30] ; y[2] = t[2]+t[5]
michael@0 751 STRH r4, [r0, #46] ; y[3] = t[3]+t[4]
michael@0 752 STRH r9, [r0, #62] ; y[4] = t[3]-t[4]
michael@0 753 STRH r14,[r0, #78] ; y[5] = t[2]-t[5]
michael@0 754 STRH r10,[r0, #94] ; y[6] = t[1]-t[6]
michael@0 755 STRH r8, [r0, #110] ; y[7] = t[0]-t[7]
michael@0 756 LDMFD r13!,{r1,PC}
michael@0 757 ENDP
michael@0 758
michael@0 759 [ OC_ARM_ASM_MEDIA
michael@0 760 EXPORT oc_idct8x8_1_v6
michael@0 761 EXPORT oc_idct8x8_v6
michael@0 762
michael@0 763 oc_idct8x8_1_v6 PROC
michael@0 764 ; r0 = ogg_int16_t *_y
michael@0 765 ; r1 = ogg_uint16_t _dc
michael@0 766 ORR r2, r1, r1, LSL #16
michael@0 767 ORR r3, r1, r1, LSL #16
michael@0 768 STRD r2, [r0], #8
michael@0 769 STRD r2, [r0], #8
michael@0 770 STRD r2, [r0], #8
michael@0 771 STRD r2, [r0], #8
michael@0 772 STRD r2, [r0], #8
michael@0 773 STRD r2, [r0], #8
michael@0 774 STRD r2, [r0], #8
michael@0 775 STRD r2, [r0], #8
michael@0 776 STRD r2, [r0], #8
michael@0 777 STRD r2, [r0], #8
michael@0 778 STRD r2, [r0], #8
michael@0 779 STRD r2, [r0], #8
michael@0 780 STRD r2, [r0], #8
michael@0 781 STRD r2, [r0], #8
michael@0 782 STRD r2, [r0], #8
michael@0 783 STRD r2, [r0], #8
michael@0 784 MOV PC, r14
michael@0 785 ENDP
michael@0 786
michael@0 787 oc_idct8x8_v6 PROC
michael@0 788 ; r0 = ogg_int16_t *_y
michael@0 789 ; r1 = ogg_int16_t *_x
michael@0 790 ; r2 = int _last_zzi
michael@0 791 CMP r2, #3
michael@0 792 BLE oc_idct8x8_3_v6
michael@0 793 ;CMP r2, #6
michael@0 794 ;BLE oc_idct8x8_6_v6
michael@0 795 CMP r2, #10
michael@0 796 BLE oc_idct8x8_10_v6
michael@0 797 oc_idct8x8_slow_v6
michael@0 798 STMFD r13!,{r4-r11,r14}
michael@0 799 SUB r13,r13,#64*2
michael@0 800 ; Row transforms
michael@0 801 STR r0, [r13,#-4]!
michael@0 802 ADD r0, r13, #4 ; Write to temp storage.
michael@0 803 BL idct8_8core_v6
michael@0 804 BL idct8_8core_v6
michael@0 805 BL idct8_8core_v6
michael@0 806 BL idct8_8core_v6
michael@0 807 LDR r0, [r13], #4 ; Write to the final destination.
michael@0 808 ; Clear input data for next block (decoder only).
michael@0 809 SUB r2, r1, #8*16
michael@0 810 CMP r0, r2
michael@0 811 MOV r1, r13 ; And read from temp storage.
michael@0 812 BEQ oc_idct8x8_slow_v6_cols
michael@0 813 MOV r4, #0
michael@0 814 MOV r5, #0
michael@0 815 STRD r4, [r2], #8
michael@0 816 STRD r4, [r2], #8
michael@0 817 STRD r4, [r2], #8
michael@0 818 STRD r4, [r2], #8
michael@0 819 STRD r4, [r2], #8
michael@0 820 STRD r4, [r2], #8
michael@0 821 STRD r4, [r2], #8
michael@0 822 STRD r4, [r2], #8
michael@0 823 STRD r4, [r2], #8
michael@0 824 STRD r4, [r2], #8
michael@0 825 STRD r4, [r2], #8
michael@0 826 STRD r4, [r2], #8
michael@0 827 STRD r4, [r2], #8
michael@0 828 STRD r4, [r2], #8
michael@0 829 STRD r4, [r2], #8
michael@0 830 STRD r4, [r2], #8
michael@0 831 oc_idct8x8_slow_v6_cols
michael@0 832 ; Column transforms
michael@0 833 BL idct8_8core_down_v6
michael@0 834 BL idct8_8core_down_v6
michael@0 835 BL idct8_8core_down_v6
michael@0 836 BL idct8_8core_down_v6
michael@0 837 ADD r13,r13,#64*2
michael@0 838 LDMFD r13!,{r4-r11,PC}
michael@0 839 ENDP
michael@0 840
michael@0 841 oc_idct8x8_10_v6 PROC
michael@0 842 STMFD r13!,{r4-r11,r14}
michael@0 843 SUB r13,r13,#64*2+4
michael@0 844 ; Row transforms
michael@0 845 MOV r2, r13
michael@0 846 STR r0, [r13,#-4]!
michael@0 847 AND r0, r2, #4 ; Align the stack.
michael@0 848 ADD r0, r0, r2 ; Write to temp storage.
michael@0 849 BL idct4_3core_v6
michael@0 850 BL idct2_1core_v6
michael@0 851 LDR r0, [r13], #4 ; Write to the final destination.
michael@0 852 ; Clear input data for next block (decoder only).
michael@0 853 SUB r2, r1, #4*16
michael@0 854 CMP r0, r2
michael@0 855 AND r1, r13,#4 ; Align the stack.
michael@0 856 BEQ oc_idct8x8_10_v6_cols
michael@0 857 MOV r4, #0
michael@0 858 MOV r5, #0
michael@0 859 STRD r4, [r2]
michael@0 860 STRD r4, [r2,#16]
michael@0 861 STR r4, [r2,#32]
michael@0 862 STR r4, [r2,#48]
michael@0 863 oc_idct8x8_10_v6_cols
michael@0 864 ; Column transforms
michael@0 865 ADD r1, r1, r13 ; And read from temp storage.
michael@0 866 BL idct4_4core_down_v6
michael@0 867 BL idct4_4core_down_v6
michael@0 868 BL idct4_4core_down_v6
michael@0 869 BL idct4_4core_down_v6
michael@0 870 ADD r13,r13,#64*2+4
michael@0 871 LDMFD r13!,{r4-r11,PC}
michael@0 872 ENDP
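; The "Align the stack" arithmetic above (r0 = r2 + (r2&4), and the matching
; r1 = r13 + (r13&4) before the column pass) rounds the temp-buffer pointer
; up to an 8-byte boundary; the extra 4 bytes reserved by SUB r13,r13,#64*2+4
; leave room for that adjustment.  Presumably this keeps the LDRD/STRD
; accesses in the v6 cores doubleword-aligned.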
michael@0 873
michael@0 874 oc_idct8x8_3_v6 PROC
michael@0 875 STMFD r13!,{r4-r8,r14}
michael@0 876 SUB r13,r13,#64*2
michael@0 877 ; Row transforms
michael@0 878 MOV r8, r0
michael@0 879 MOV r0, r13 ; Write to temp storage.
michael@0 880 BL idct2_1core_v6
michael@0 881 ; Clear input data for next block (decoder only).
michael@0 882 SUB r0, r1, #2*16
michael@0 883 CMP r0, r8
michael@0 884 MOV r1, r13 ; Read from temp storage.
michael@0 885 MOVNE r4, #0
michael@0 886 STRNE r4, [r0]
michael@0 887 STRNE r4, [r0,#16]
michael@0 888 MOVNE r0, r8 ; Write to the final destination.
michael@0 889 ; Column transforms
michael@0 890 BL idct2_2core_down_v6
michael@0 891 BL idct2_2core_down_v6
michael@0 892 BL idct2_2core_down_v6
michael@0 893 BL idct2_2core_down_v6
michael@0 894 ADD r13,r13,#64*2
michael@0 895 LDMFD r13!,{r4-r8,PC}
michael@0 896 ENDP
michael@0 897
michael@0 898 idct2_1core_v6 PROC
michael@0 899 ; r0 = ogg_int16_t *_y (destination)
michael@0 900 ; r1 = const ogg_int16_t *_x (source)
michael@0 901 ; Stage 1:
michael@0 902 LDR r2, [r1], #16 ; r2 = <x[0,1]|x[0,0]>
michael@0 903 LDR r3, OC_C4S4
michael@0 904 LDRSH r6, [r1], #16 ; r6 = x[1,0]
michael@0 905 SMULWB r12,r3, r2 ; r12= t[0,0]=OC_C4S4*x[0,0]>>16
michael@0 906 LDRD r4, OC_C7S1 ; r4 = OC_C7S1; r5 = OC_C1S7
michael@0 907 SMULWB r6, r3, r6 ; r6 = t[1,0]=OC_C4S4*x[1,0]>>16
michael@0 908 SMULWT r4, r4, r2 ; r4 = t[0,4]=OC_C7S1*x[0,1]>>16
michael@0 909 SMULWT r7, r5, r2 ; r7 = t[0,7]=OC_C1S7*x[0,1]>>16
michael@0 910 ; Stage 2:
michael@0 911 SMULWB r5, r3, r4 ; r5 = t[0,5]=OC_C4S4*t[0,4]>>16
michael@0 912 PKHBT r12,r12,r6, LSL #16 ; r12= <t[1,0]|t[0,0]>
michael@0 913 SMULWB r6, r3, r7 ; r6 = t[0,6]=OC_C4S4*t[0,7]>>16
michael@0 914 PKHBT r7, r7, r3 ; r7 = <0|t[0,7]>
michael@0 915 ; Stage 3:
michael@0 916 PKHBT r5, r6, r5, LSL #16 ; r5 = <t[0,5]|t[0,6]>
michael@0 917 PKHBT r4, r4, r3 ; r4 = <0|t[0,4]>
michael@0 918 SASX r5, r5, r5 ; r5 = <t[0,6]+t[0,5]|t[0,6]-t[0,5]>
michael@0 919 ; Stage 4:
michael@0 920 PKHTB r6, r3, r5, ASR #16 ; r6 = <0|t[0,6]>
michael@0 921 PKHBT r5, r5, r3 ; r5 = <0|t[0,5]>
michael@0 922 SADD16 r3, r12,r7 ; r3 = t[0]+t[7]
michael@0 923 STR r3, [r0], #4 ; y[0<<3] = t[0]+t[7]
michael@0 924 SADD16 r3, r12,r6 ; r3 = t[0]+t[6]
michael@0 925 STR r3, [r0, #12] ; y[1<<3] = t[0]+t[6]
michael@0 926 SADD16 r3, r12,r5 ; r3 = t[0]+t[5]
michael@0 927 STR r3, [r0, #28] ; y[2<<3] = t[0]+t[5]
michael@0 928 SADD16 r3, r12,r4 ; r3 = t[0]+t[4]
michael@0 929 STR r3, [r0, #44] ; y[3<<3] = t[0]+t[4]
michael@0 930 SSUB16 r4, r12,r4 ; r4 = t[0]-t[4]
michael@0 931 STR r4, [r0, #60] ; y[4<<3] = t[0]-t[4]
michael@0 932 SSUB16 r5, r12,r5 ; r5 = t[0]-t[5]
michael@0 933 STR r5, [r0, #76] ; y[5<<3] = t[0]-t[5]
michael@0 934 SSUB16 r6, r12,r6 ; r6 = t[0]-t[6]
michael@0 935 STR r6, [r0, #92] ; y[6<<3] = t[0]-t[6]
michael@0 936 SSUB16 r7, r12,r7 ; r7 = t[0]-t[7]
michael@0 937 STR r7, [r0, #108] ; y[7<<3] = t[0]-t[7]
michael@0 938 MOV PC,r14
michael@0 939 ENDP
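; The _v6 routines rely on ARMv6 media instructions to process two rows (or
; two columns) at a time, packed as halfword pairs in a single register:
;   SMULWB/SMULWT Rd,Rn,Rm : Rd = (Rn * bottom/top halfword of Rm) >> 16
;   SMLAWB/SMLAWT          : the same, plus an accumulator
;   PKHBT/PKHTB            : pack bottom/top halfwords from two registers
;   SADD16/SSUB16/SASX     : dual 16-bit add/subtract (and add/sub exchange)
; That is why the v6 drivers call each core only four times: every call
; transforms a pair of rows or columns.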
michael@0 940 ]
michael@0 941
michael@0 942 ALIGN 8
michael@0 943 OC_C7S1
michael@0 944 DCD 12785 ; 31F1
michael@0 945 OC_C1S7
michael@0 946 DCD 64277 ; FB15
michael@0 947 OC_C6S2
michael@0 948 DCD 25080 ; 61F8
michael@0 949 OC_C2S6
michael@0 950 DCD 60547 ; EC83
michael@0 951 OC_C5S3
michael@0 952 DCD 36410 ; 8E3A
michael@0 953 OC_C3S5
michael@0 954 DCD 54491 ; D4DB
michael@0 955 OC_C4S4
michael@0 956 DCD 46341 ; B505
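; These are the 16.16 fixed-point DCT constants: OC_CnSm = round(2^16*
; cos(n*pi/16)) = round(2^16*sin(m*pi/16)), with n+m = 8.  For example
; OC_C4S4 = 46341 ~= 0.70711*65536 (cos(pi/4)) and OC_C1S7 = 64277 ~=
; 0.98079*65536 (cos(pi/16)), so a multiply followed by a 16-bit right shift
; computes "coefficient times cosine" in plain integer arithmetic.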
michael@0 957
michael@0 958 [ OC_ARM_ASM_MEDIA
michael@0 959 idct2_2core_down_v6 PROC
michael@0 960 ; r0 = ogg_int16_t *_y (destination)
michael@0 961 ; r1 = const ogg_int16_t *_x (source)
michael@0 962 ; Stage 1:
michael@0 963 LDR r2, [r1], #16 ; r2 = <x[0,1]|x[0,0]>
michael@0 964 LDR r3, OC_C4S4
michael@0 965 MOV r7 ,#8 ; r7 = 8
michael@0 966 LDR r6, [r1], #16 ; r6 = <x[1,1]|x[1,0]>
michael@0 967 SMLAWB r12,r3, r2, r7 ; r12= (t[0,0]=OC_C4S4*x[0,0]>>16)+8
michael@0 968 LDRD r4, OC_C7S1 ; r4 = OC_C7S1; r5 = OC_C1S7
michael@0 969 SMLAWB r7, r3, r6, r7 ; r7 = (t[1,0]=OC_C4S4*x[1,0]>>16)+8
michael@0 970 SMULWT r5, r5, r2 ; r5 = t[0,7]=OC_C1S7*x[0,1]>>16
michael@0 971 PKHBT r12,r12,r7, LSL #16 ; r12= <t[1,0]+8|t[0,0]+8>
michael@0 972 SMULWT r4, r4, r2 ; r4 = t[0,4]=OC_C7S1*x[0,1]>>16
michael@0 973 ; Here we cheat: row 1 had just a DC, so x[0,1]==x[1,1] by definition.
michael@0 974 PKHBT r7, r5, r5, LSL #16 ; r7 = <t[0,7]|t[0,7]>
michael@0 975 ; Stage 2:
michael@0 976 SMULWB r6, r3, r7 ; r6 = t[0,6]=OC_C4S4*t[0,7]>>16
michael@0 977 PKHBT r4, r4, r4, LSL #16 ; r4 = <t[0,4]|t[0,4]>
michael@0 978 SMULWT r2, r3, r7 ; r2 = t[1,6]=OC_C4S4*t[1,7]>>16
michael@0 979 SMULWB r5, r3, r4 ; r5 = t[0,5]=OC_C4S4*t[0,4]>>16
michael@0 980 PKHBT r6, r6, r2, LSL #16 ; r6 = <t[1,6]|t[0,6]>
michael@0 981 SMULWT r2, r3, r4 ; r2 = t[1,5]=OC_C4S4*t[1,4]>>16
michael@0 982 PKHBT r2, r5, r2, LSL #16 ; r2 = <t[1,5]|t[0,5]>
michael@0 983 ; Stage 3:
michael@0 984 SSUB16 r5, r6, r2 ; r5 = <t[1,6]-t[1,5]|t[0,6]-t[0,5]>
michael@0 985 SADD16 r6, r6, r2 ; r6 = <t[1,6]+t[1,5]|t[0,6]+t[0,5]>
michael@0 986 ; Stage 4:
michael@0 987 SADD16 r2, r12,r7 ; r2 = t[0]+t[7]+8
michael@0 988 MOV r3, r2, ASR #4
michael@0 989 MOV r2, r2, LSL #16
michael@0 990 PKHTB r3, r3, r2, ASR #20 ; r3 = t[0]+t[7]+8>>4
michael@0 991 STR r3, [r0], #4 ; y[0<<3] = t[0]+t[7]+8>>4
michael@0 992 SADD16 r2, r12,r6 ; r2 = t[0]+t[6]+8
michael@0 993 MOV r3, r2, ASR #4
michael@0 994 MOV r2, r2, LSL #16
michael@0 995 PKHTB r3, r3, r2, ASR #20 ; r3 = t[0]+t[6]+8>>4
michael@0 996 STR r3, [r0, #12] ; y[1<<3] = t[0]+t[6]+8>>4
michael@0 997 SADD16 r2, r12,r5 ; r2 = t[0]+t[5]+8
michael@0 998 MOV r3, r2, ASR #4
michael@0 999 MOV r2, r2, LSL #16
michael@0 1000 PKHTB r3, r3, r2, ASR #20 ; r3 = t[0]+t[5]+8>>4
michael@0 1001 STR r3, [r0, #28] ; y[2<<3] = t[0]+t[5]+8>>4
michael@0 1002 SADD16 r2, r12,r4 ; r2 = t[0]+t[4]+8
michael@0 1003 MOV r3, r2, ASR #4
michael@0 1004 MOV r2, r2, LSL #16
michael@0 1005 PKHTB r3, r3, r2, ASR #20 ; r3 = t[0]+t[4]+8>>4
michael@0 1006 STR r3, [r0, #44] ; y[3<<3] = t[0]+t[4]+8>>4
michael@0 1007 SSUB16 r4, r12,r4 ; r4 = t[0]-t[4]+8
michael@0 1008 MOV r3, r4, ASR #4
michael@0 1009 MOV r4, r4, LSL #16
michael@0 1010 PKHTB r3, r3, r4, ASR #20 ; r3 = t[0]-t[4]+8>>4
michael@0 1011 STR r3, [r0, #60] ; y[4<<3] = t[0]-t[4]+8>>4
michael@0 1012 SSUB16 r5, r12,r5 ; r5 = t[0]-t[5]+8
michael@0 1013 MOV r3, r5, ASR #4
michael@0 1014 MOV r5, r5, LSL #16
michael@0 1015 PKHTB r3, r3, r5, ASR #20 ; r3 = t[0]-t[5]+8>>4
michael@0 1016 STR r3, [r0, #76] ; y[5<<3] = t[0]-t[5]+8>>4
michael@0 1017 SSUB16 r6, r12,r6 ; r6 = t[0]-t[6]+8
michael@0 1018 MOV r3, r6, ASR #4
michael@0 1019 MOV r6, r6, LSL #16
michael@0 1020 PKHTB r3, r3, r6, ASR #20 ; r3 = t[0]-t[6]+8>>4
michael@0 1021 STR r3, [r0, #92] ; y[6<<3] = t[0]-t[6]+8>>4
michael@0 1022 SSUB16 r7, r12,r7 ; r7 = t[0]-t[7]+8
michael@0 1023 MOV r3, r7, ASR #4
michael@0 1024 MOV r7, r7, LSL #16
michael@0 1025 PKHTB r3, r3, r7, ASR #20 ; r3 = t[0]-t[7]+8>>4
michael@0 1026 STR r3, [r0, #108] ; y[7<<3] = t[0]-t[7]+8>>4
michael@0 1027 MOV PC,r14
michael@0 1028 ENDP
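; Note the per-halfword shift idiom used above: to arithmetic-shift both
; halfwords of a packed pair r right by 4, the code forms
;   r ASR #4            (top halfword already correct)
;   (r LSL #16) ASR #20 (bottom halfword shifted with its own sign)
; and merges the two with PKHTB, since ARMv6 has no dual 16-bit arithmetic
; shift instruction.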
michael@0 1029
michael@0 1030 ; In theory this should save ~75 cycles over oc_idct8x8_10, more than enough to
michael@0 1031 ; pay for increased branch mis-prediction to get here, but in practice it
michael@0 1032 ; doesn't seem to slow anything down to take it out, and it's less code this
michael@0 1033 ; way.
michael@0 1034 [ 0
michael@0 1035 oc_idct8x8_6_v6 PROC
michael@0 1036 STMFD r13!,{r4-r8,r10,r11,r14}
michael@0 1037 SUB r13,r13,#64*2+4
michael@0 1038 ; Row transforms
michael@0 1039 MOV r8, r0
michael@0 1040 AND r0, r13,#4 ; Align the stack.
michael@0 1041 ADD r0, r0, r13 ; Write to temp storage.
michael@0 1042 BL idct3_2core_v6
michael@0 1043 BL idct1core_v6
michael@0 1044 ; Clear input data for next block (decoder only).
michael@0 1045 SUB r0, r1, #3*16
michael@0 1046 CMP r0, r8
michael@0 1047 AND r1, r13,#4 ; Align the stack.
michael@0 1048 BEQ oc_idct8x8_6_v6_cols
michael@0 1049 MOV r4, #0
michael@0 1050 MOV r5, #0
michael@0 1051 STRD r4, [r0]
michael@0 1052 STR r4, [r0,#16]
michael@0 1053 STR r4, [r0,#32]
michael@0 1054 MOV r0, r8 ; Write to the final destination.
michael@0 1055 oc_idct8x8_6_v6_cols
michael@0 1056 ; Column transforms
michael@0 1057 ADD r1, r1, r13 ; And read from temp storage.
michael@0 1058 BL idct3_3core_down_v6
michael@0 1059 BL idct3_3core_down_v6
michael@0 1060 BL idct3_3core_down_v6
michael@0 1061 BL idct3_3core_down_v6
michael@0 1062 ADD r13,r13,#64*2+4
michael@0 1063 LDMFD r13!,{r4-r8,r10,r11,PC}
michael@0 1064 ENDP
michael@0 1065
michael@0 1066 idct1core_v6 PROC
michael@0 1067 ; r0 = ogg_int16_t *_y (destination)
michael@0 1068 ; r1 = const ogg_int16_t *_x (source)
michael@0 1069 LDRSH r3, [r1], #16
michael@0 1070 MOV r12,#0x05
michael@0 1071 ORR r12,r12,#0xB500
michael@0 1072 MUL r3, r12, r3
michael@0 1073 ; Stall ?
michael@0 1074 MOV r3, r3, ASR #16
michael@0 1075 ; Don't need to actually store the odd lines; they won't be read.
michael@0 1076 STRH r3, [r0], #2
michael@0 1077 STRH r3, [r0, #30]
michael@0 1078 STRH r3, [r0, #62]
michael@0 1079 STRH r3, [r0, #94]
michael@0 1080 MOV PC,R14
michael@0 1081 ENDP
michael@0 1082
michael@0 1083 idct3_2core_v6 PROC
michael@0 1084 ; r0 = ogg_int16_t *_y (destination)
michael@0 1085 ; r1 = const ogg_int16_t *_x (source)
michael@0 1086 ; Stage 1:
michael@0 1087 LDRD r4, [r1], #16 ; r4 = <x[0,1]|x[0,0]>; r5 = <*|x[0,2]>
michael@0 1088 LDRD r10,OC_C6S2_3_v6 ; r10= OC_C6S2; r11= OC_C2S6
michael@0 1089 ; Stall
michael@0 1090 SMULWB r3, r11,r5 ; r3 = t[0,3]=OC_C2S6*x[0,2]>>16
michael@0 1091 LDR r11,OC_C4S4
michael@0 1092 SMULWB r2, r10,r5 ; r2 = t[0,2]=OC_C6S2*x[0,2]>>16
michael@0 1093 LDR r5, [r1], #16 ; r5 = <x[1,1]|x[1,0]>
michael@0 1094 SMULWB r12,r11,r4 ; r12= (t[0,0]=OC_C4S4*x[0,0]>>16)
michael@0 1095 LDRD r6, OC_C7S1_3_v6 ; r6 = OC_C7S1; r7 = OC_C1S7
michael@0 1096 SMULWB r10,r11,r5 ; r10= (t[1,0]=OC_C4S4*x[1,0]>>16)
michael@0 1097 PKHBT r12,r12,r10,LSL #16 ; r12= <t[1,0]|t[0,0]>
michael@0 1098 SMULWT r10,r7, r5 ; r10= t[1,7]=OC_C1S7*x[1,1]>>16
michael@0 1099 PKHBT r2, r2, r11 ; r2 = <0|t[0,2]>
michael@0 1100 SMULWT r7, r7, r4 ; r7 = t[0,7]=OC_C1S7*x[0,1]>>16
michael@0 1101 PKHBT r3, r3, r11 ; r3 = <0|t[0,3]>
michael@0 1102 SMULWT r5, r6, r5 ; r5 = t[1,4]=OC_C7S1*x[1,1]>>16
michael@0 1103 PKHBT r7, r7, r10,LSL #16 ; r7 = <t[1,7]|t[0,7]>
michael@0 1104 SMULWT r4, r6, r4 ; r4 = t[0,4]=OC_C7S1*x[0,1]>>16
michael@0 1105 ; Stage 2:
michael@0 1106 SMULWB r6, r11,r7 ; r6 = t[0,6]=OC_C4S4*t[0,7]>>16
michael@0 1107 PKHBT r4, r4, r5, LSL #16 ; r4 = <t[1,4]|t[0,4]>
michael@0 1108 SMULWT r10,r11,r7 ; r10= t[1,6]=OC_C4S4*t[1,7]>>16
michael@0 1109 SMULWB r5, r11,r4 ; r5 = t[0,5]=OC_C4S4*t[0,4]>>16
michael@0 1110 PKHBT r6, r6, r10,LSL #16 ; r6 = <t[1,6]|t[0,6]>
michael@0 1111 SMULWT r10,r11,r4 ; r10= t[1,5]=OC_C4S4*t[1,4]>>16
michael@0 1112 ; Stage 3:
michael@0 1113 B idct4_3core_stage3_v6
michael@0 1114 ENDP
michael@0 1115
michael@0 1116 ; Another copy so the LDRD offsets are less than +/- 255.
michael@0 1117 ALIGN 8
michael@0 1118 OC_C7S1_3_v6
michael@0 1119 DCD 12785 ; 31F1
michael@0 1120 OC_C1S7_3_v6
michael@0 1121 DCD 64277 ; FB15
michael@0 1122 OC_C6S2_3_v6
michael@0 1123 DCD 25080 ; 61F8
michael@0 1124 OC_C2S6_3_v6
michael@0 1125 DCD 60547 ; EC83
michael@0 1126
michael@0 1127 idct3_3core_down_v6 PROC
michael@0 1128 ; r0 = ogg_int16_t *_y (destination)
michael@0 1129 ; r1 = const ogg_int16_t *_x (source)
michael@0 1130 ; Stage 1:
michael@0 1131 LDRD r10,[r1], #16 ; r10= <x[0,1]|x[0,0]>; r11= <??|x[0,2]>
michael@0 1132 LDRD r6, OC_C6S2_3_v6 ; r6 = OC_C6S2; r7 = OC_C2S6
michael@0 1133 LDR r4, [r1], #16 ; r4 = <x[1,1]|x[1,0]>
michael@0 1134 SMULWB r3, r7, r11 ; r3 = t[0,3]=OC_C2S6*x[0,2]>>16
michael@0 1135 MOV r7,#8
michael@0 1136 SMULWB r2, r6, r11 ; r2 = t[0,2]=OC_C6S2*x[0,2]>>16
michael@0 1137 LDR r11,OC_C4S4
michael@0 1138 SMLAWB r12,r11,r10,r7 ; r12= t[0,0]+8=(OC_C4S4*x[0,0]>>16)+8
michael@0 1139 ; Here we cheat: row 2 had just a DC, so x[0,2]==x[1,2] by definition.
michael@0 1140 PKHBT r3, r3, r3, LSL #16 ; r3 = <t[0,3]|t[0,3]>
michael@0 1141 SMLAWB r5, r11,r4, r7 ; r5 = t[1,0]+8=(OC_C4S4*x[1,0]>>16)+8
michael@0 1142 PKHBT r2, r2, r2, LSL #16 ; r2 = <t[0,2]|t[0,2]>
michael@0 1143 LDRD r6, OC_C7S1_3_v6 ; r6 = OC_C7S1; r7 = OC_C1S7
michael@0 1144 PKHBT r12,r12,r5, LSL #16 ; r12= <t[1,0]+8|t[0,0]+8>
michael@0 1145 SMULWT r5, r7, r4 ; r5 = t[1,7]=OC_C1S7*x[1,1]>>16
michael@0 1146 SMULWT r7, r7, r10 ; r7 = t[0,7]=OC_C1S7*x[0,1]>>16
michael@0 1147 SMULWT r10,r6, r10 ; r10= t[0,4]=OC_C7S1*x[0,1]>>16
michael@0 1148 PKHBT r7, r7, r5, LSL #16 ; r7 = <t[1,7]|t[0,7]>
michael@0 1149 SMULWT r4, r6, r4 ; r4 = t[1,4]=OC_C7S1*x[1,1]>>16
michael@0 1150 ; Stage 2:
michael@0 1151 SMULWB r6, r11,r7 ; r6 = t[0,6]=OC_C4S4*t[0,7]>>16
michael@0 1152 PKHBT r4, r10,r4, LSL #16 ; r4 = <t[1,4]|t[0,4]>
michael@0 1153 SMULWT r10,r11,r7 ; r10= t[1,6]=OC_C4S4*t[1,7]>>16
michael@0 1154 SMULWB r5, r11,r4 ; r5 = t[0,5]=OC_C4S4*t[0,4]>>16
michael@0 1155 PKHBT r6, r6, r10,LSL #16 ; r6 = <t[1,6]|t[0,6]>
michael@0 1156 SMULWT r10,r11,r4 ; r10= t[1,5]=OC_C4S4*t[1,4]>>16
michael@0 1157 ; Stage 3:
michael@0 1158 B idct4_4core_down_stage3_v6
michael@0 1159 ENDP
michael@0 1160 ]
michael@0 1161
michael@0 1162 idct4_3core_v6 PROC
michael@0 1163 ; r0 = ogg_int16_t *_y (destination)
michael@0 1164 ; r1 = const ogg_int16_t *_x (source)
michael@0 1165 ; Stage 1:
michael@0 1166 LDRD r10,[r1], #16 ; r10= <x[0,1]|x[0,0]>; r11= <x[0,3]|x[0,2]>
michael@0 1167 LDRD r2, OC_C5S3_4_v6 ; r2 = OC_C5S3; r3 = OC_C3S5
michael@0 1168 LDRD r4, [r1], #16 ; r4 = <x[1,1]|x[1,0]>; r5 = <??|x[1,2]>
michael@0 1169 SMULWT r9, r3, r11 ; r9 = t[0,6]=OC_C3S5*x[0,3]>>16
michael@0 1170 SMULWT r8, r2, r11 ; r8 = -t[0,5]=OC_C5S3*x[0,3]>>16
michael@0 1171 PKHBT r9, r9, r2 ; r9 = <0|t[0,6]>
michael@0 1172 LDRD r6, OC_C6S2_4_v6 ; r6 = OC_C6S2; r7 = OC_C2S6
michael@0 1173 PKHBT r8, r8, r2 ; r8 = <0|-t[0,5]>
michael@0 1174 SMULWB r3, r7, r11 ; r3 = t[0,3]=OC_C2S6*x[0,2]>>16
michael@0 1175 SMULWB r2, r6, r11 ; r2 = t[0,2]=OC_C6S2*x[0,2]>>16
michael@0 1176 LDR r11,OC_C4S4
michael@0 1177 SMULWB r12,r7, r5 ; r12= t[1,3]=OC_C2S6*x[1,2]>>16
michael@0 1178 SMULWB r5, r6, r5 ; r5 = t[1,2]=OC_C6S2*x[1,2]>>16
michael@0 1179 PKHBT r3, r3, r12,LSL #16 ; r3 = <t[1,3]|t[0,3]>
michael@0 1180 SMULWB r12,r11,r10 ; r12= t[0,0]=OC_C4S4*x[0,0]>>16
michael@0 1181 PKHBT r2, r2, r5, LSL #16 ; r2 = <t[1,2]|t[0,2]>
michael@0 1182 SMULWB r5, r11,r4 ; r5 = t[1,0]=OC_C4S4*x[1,0]>>16
michael@0 1183 LDRD r6, OC_C7S1_4_v6 ; r6 = OC_C7S1; r7 = OC_C1S7
michael@0 1184 PKHBT r12,r12,r5, LSL #16 ; r12= <t[1,0]|t[0,0]>
michael@0 1185 SMULWT r5, r7, r4 ; r5 = t[1,7]=OC_C1S7*x[1,1]>>16
michael@0 1186 SMULWT r7, r7, r10 ; r7 = t[0,7]=OC_C1S7*x[0,1]>>16
michael@0 1187 SMULWT r10,r6, r10 ; r10= t[0,4]=OC_C7S1*x[0,1]>>16
michael@0 1188 PKHBT r7, r7, r5, LSL #16 ; r7 = <t[1,7]|t[0,7]>
michael@0 1189 SMULWT r4, r6, r4 ; r4 = t[1,4]=OC_C7S1*x[1,1]>>16
michael@0 1190 ; Stage 2:
michael@0 1191 SSUB16 r6, r7, r9 ; r6 = t[7]-t[6]
michael@0 1192 PKHBT r4, r10,r4, LSL #16 ; r4 = <t[1,4]|t[0,4]>
michael@0 1193 SADD16 r7, r7, r9 ; r7 = t[7]=t[7]+t[6]
michael@0 1194 SMULWT r9, r11,r6 ; r9 = t[1,6]=OC_C4S4*r6T>>16
michael@0 1195 SADD16 r5, r4, r8 ; r5 = t[4]-t[5]
michael@0 1196 SMULWB r6, r11,r6 ; r6 = t[0,6]=OC_C4S4*r6B>>16
michael@0 1197 SSUB16 r4, r4, r8 ; r4 = t[4]=t[4]+t[5]
michael@0 1198 SMULWT r10,r11,r5 ; r10= t[1,5]=OC_C4S4*r5T>>16
michael@0 1199 PKHBT r6, r6, r9, LSL #16 ; r6 = <t[1,6]|t[0,6]>
michael@0 1200 SMULWB r5, r11,r5 ; r5 = t[0,5]=OC_C4S4*r5B>>16
michael@0 1201 ; Stage 3:
michael@0 1202 idct4_3core_stage3_v6
michael@0 1203 SADD16 r11,r12,r2 ; r11= t[1]=t[0]+t[2]
michael@0 1204 PKHBT r10,r5, r10,LSL #16 ; r10= <t[1,5]|t[0,5]>
michael@0 1205 SSUB16 r2, r12,r2 ; r2 = t[2]=t[0]-t[2]
michael@0 1206 idct4_3core_stage3_5_v6
michael@0 1207 SSUB16 r5, r6, r10 ; r5 = t[5]'=t[6]-t[5]
michael@0 1208 SADD16 r6, r6, r10 ; r6 = t[6]=t[6]+t[5]
michael@0 1209 SADD16 r10,r12,r3 ; r10= t[0]'=t[0]+t[3]
michael@0 1210 SSUB16 r3, r12,r3 ; r3 = t[3]=t[0]-t[3]
michael@0 1211 ; Stage 4:
michael@0 1212 SADD16 r12,r10,r7 ; r12= t[0]+t[7]
michael@0 1213 STR r12,[r0], #4 ; y[0<<3] = t[0]+t[7]
michael@0 1214 SADD16 r12,r11,r6 ; r12= t[1]+t[6]
michael@0 1215 STR r12,[r0, #12] ; y[1<<3] = t[1]+t[6]
michael@0 1216 SADD16 r12,r2, r5 ; r12= t[2]+t[5]
michael@0 1217 STR r12,[r0, #28] ; y[2<<3] = t[2]+t[5]
michael@0 1218 SADD16 r12,r3, r4 ; r12= t[3]+t[4]
michael@0 1219 STR r12,[r0, #44] ; y[3<<3] = t[3]+t[4]
michael@0 1220 SSUB16 r4, r3, r4 ; r4 = t[3]-t[4]
michael@0 1221 STR r4, [r0, #60] ; y[4<<3] = t[3]-t[4]
michael@0 1222 SSUB16 r5, r2, r5 ; r5 = t[2]-t[5]
michael@0 1223 STR r5, [r0, #76] ; y[5<<3] = t[2]-t[5]
michael@0 1224 SSUB16 r6, r11,r6 ; r6 = t[1]-t[6]
michael@0 1225 STR r6, [r0, #92] ; y[6<<3] = t[1]-t[6]
michael@0 1226 SSUB16 r7, r10,r7 ; r7 = t[0]-t[7]
michael@0 1227 STR r7, [r0, #108] ; y[7<<3] = t[0]-t[7]
michael@0 1228 MOV PC,r14
michael@0 1229 ENDP
michael@0 1230
michael@0 1231 ; Another copy so the LDRD offsets are less than +/- 255.
michael@0 1232 ALIGN 8
michael@0 1233 OC_C7S1_4_v6
michael@0 1234 DCD 12785 ; 31F1
michael@0 1235 OC_C1S7_4_v6
michael@0 1236 DCD 64277 ; FB15
michael@0 1237 OC_C6S2_4_v6
michael@0 1238 DCD 25080 ; 61F8
michael@0 1239 OC_C2S6_4_v6
michael@0 1240 DCD 60547 ; EC83
michael@0 1241 OC_C5S3_4_v6
michael@0 1242 DCD 36410 ; 8E3A
michael@0 1243 OC_C3S5_4_v6
michael@0 1244 DCD 54491 ; D4DB
michael@0 1245
michael@0 1246 idct4_4core_down_v6 PROC
michael@0 1247 ; r0 = ogg_int16_t *_y (destination)
michael@0 1248 ; r1 = const ogg_int16_t *_x (source)
michael@0 1249 ; Stage 1:
michael@0 1250 LDRD r10,[r1], #16 ; r10= <x[0,1]|x[0,0]>; r11= <x[0,3]|x[0,2]>
michael@0 1251 LDRD r2, OC_C5S3_4_v6 ; r2 = OC_C5S3; r3 = OC_C3S5
michael@0 1252 LDRD r4, [r1], #16 ; r4 = <x[1,1]|x[1,0]>; r5 = <x[1,3]|x[1,2]>
michael@0 1253 SMULWT r9, r3, r11 ; r9 = t[0,6]=OC_C3S5*x[0,3]>>16
michael@0 1254 LDRD r6, OC_C6S2_4_v6 ; r6 = OC_C6S2; r7 = OC_C2S6
michael@0 1255 SMULWT r8, r2, r11 ; r8 = -t[0,5]=OC_C5S3*x[0,3]>>16
michael@0 1256 ; Here we cheat: row 3 had just a DC, so x[0,3]==x[1,3] by definition.
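; (Both halfword lanes therefore need the same product, so one SMULWT
; plus a PKHBT that duplicates the result replaces the second multiply
; for row 3.)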
michael@0 1257 PKHBT r9, r9, r9, LSL #16 ; r9 = <t[0,6]|t[0,6]>
michael@0 1258 SMULWB r3, r7, r11 ; r3 = t[0,3]=OC_C2S6*x[0,2]>>16
michael@0 1259 PKHBT r8, r8, r8, LSL #16 ; r8 = <-t[0,5]|-t[0,5]>
michael@0 1260 SMULWB r2, r6, r11 ; r2 = t[0,2]=OC_C6S2*x[0,2]>>16
michael@0 1261 LDR r11,OC_C4S4
michael@0 1262 SMULWB r12,r7, r5 ; r12= t[1,3]=OC_C2S6*x[1,2]>>16
michael@0 1263 MOV r7,#8
michael@0 1264 SMULWB r5, r6, r5 ; r5 = t[1,2]=OC_C6S2*x[1,2]>>16
michael@0 1265 PKHBT r3, r3, r12,LSL #16 ; r3 = <t[1,3]|t[0,3]>
michael@0 1266 SMLAWB r12,r11,r10,r7 ; r12= t[0,0]+8=(OC_C4S4*x[0,0]>>16)+8
michael@0 1267 PKHBT r2, r2, r5, LSL #16 ; r2 = <t[1,2]|t[0,2]>
michael@0 1268 SMLAWB r5, r11,r4 ,r7 ; r5 = t[1,0]+8=(OC_C4S4*x[1,0]>>16)+8
michael@0 1269 LDRD r6, OC_C7S1_4_v6 ; r6 = OC_C7S1; r7 = OC_C1S7
michael@0 1270 PKHBT r12,r12,r5, LSL #16 ; r12= <t[1,0]+8|t[0,0]+8>
michael@0 1271 SMULWT r5, r7, r4 ; r5 = t[1,7]=OC_C1S7*x[1,1]>>16
michael@0 1272 SMULWT r7, r7, r10 ; r7 = t[0,7]=OC_C1S7*x[0,1]>>16
michael@0 1273 SMULWT r10,r6, r10 ; r10= t[0,4]=OC_C7S1*x[0,1]>>16
michael@0 1274 PKHBT r7, r7, r5, LSL #16 ; r7 = <t[1,7]|t[0,7]>
michael@0 1275 SMULWT r4, r6, r4 ; r4 = t[1,4]=OC_C7S1*x[1,1]>>16
michael@0 1276 ; Stage 2:
michael@0 1277 SSUB16 r6, r7, r9 ; r6 = t[7]-t[6]
michael@0 1278 PKHBT r4, r10,r4, LSL #16 ; r4 = <t[1,4]|t[0,4]>
michael@0 1279 SADD16 r7, r7, r9 ; r7 = t[7]=t[7]+t[6]
michael@0 1280 SMULWT r9, r11,r6 ; r9 = t[1,6]=OC_C4S4*r6T>>16
michael@0 1281 SADD16 r5, r4, r8 ; r5 = t[4]-t[5]
michael@0 1282 SMULWB r6, r11,r6 ; r6 = t[0,6]=OC_C4S4*r6B>>16
michael@0 1283 SSUB16 r4, r4, r8 ; r4 = t[4]=t[4]+t[5]
michael@0 1284 SMULWT r10,r11,r5 ; r10= t[1,5]=OC_C4S4*r5T>>16
michael@0 1285 PKHBT r6, r6, r9, LSL #16 ; r6 = <t[1,6]|t[0,6]>
michael@0 1286 SMULWB r5, r11,r5 ; r5 = t[0,5]=OC_C4S4*r5B>>16
michael@0 1287 ; Stage 3:
michael@0 1288 idct4_4core_down_stage3_v6
michael@0 1289 SADD16 r11,r12,r2 ; r11= t[1]+8=t[0]+t[2]+8
michael@0 1290 PKHBT r10,r5, r10,LSL #16 ; r10= <t[1,5]|t[0,5]>
michael@0 1291 SSUB16 r2, r12,r2 ; r2 = t[2]+8=t[0]-t[2]+8
michael@0 1292 B idct8_8core_down_stage3_5_v6
michael@0 1293 ENDP
michael@0 1294
michael@0 1295 idct8_8core_v6 PROC
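; r0 = ogg_int16_t *_y (destination)
; r1 = ogg_int16_t *_x (source)
; Full 8-point core: transforms two sets of eight coefficients at once,
; one set in the low halfwords and one in the high halfwords.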
michael@0 1296 STMFD r13!,{r0,r14}
michael@0 1297 ; Stage 1:
michael@0 1298 ;5-6 rotation by 3pi/16
michael@0 1299 LDRD r10,OC_C5S3_4_v6 ; r10= OC_C5S3, r11= OC_C3S5
michael@0 1300 LDR r4, [r1,#8] ; r4 = <x[0,5]|x[0,4]>
michael@0 1301 LDR r7, [r1,#24] ; r7 = <x[1,5]|x[1,4]>
michael@0 1302 SMULWT r5, r11,r4 ; r5 = OC_C3S5*x[0,5]>>16
michael@0 1303 LDR r0, [r1,#4] ; r0 = <x[0,3]|x[0,2]>
michael@0 1304 SMULWT r3, r11,r7 ; r3 = OC_C3S5*x[1,5]>>16
michael@0 1305 LDR r12,[r1,#20] ; r12= <x[1,3]|x[1,2]>
michael@0 1306 SMULWT r6, r11,r0 ; r6 = OC_C3S5*x[0,3]>>16
michael@0 1307 SMULWT r11,r11,r12 ; r11= OC_C3S5*x[1,3]>>16
michael@0 1308 SMLAWT r6, r10,r4, r6 ; r6 = t[0,6]=r6+(OC_C5S3*x[0,5]>>16)
michael@0 1309 PKHBT r5, r5, r3, LSL #16 ; r5 = <r3|r5>
michael@0 1310 SMLAWT r11,r10,r7, r11 ; r11= t[1,6]=r11+(OC_C5S3*x[1,5]>>16)
michael@0 1311 PKHBT r4, r4, r7, LSL #16 ; r4 = <x[1,4]|x[0,4]>
michael@0 1312 SMULWT r3, r10,r0 ; r3 = OC_C5S3*x[0,3]>>16
michael@0 1313 PKHBT r6, r6, r11,LSL #16 ; r6 = <t[1,6]|t[0,6]>
michael@0 1314 SMULWT r8, r10,r12 ; r8 = OC_C5S3*x[1,3]>>16
michael@0 1315 ;2-3 rotation by 6pi/16
michael@0 1316 LDRD r10,OC_C6S2_4_v6 ; r10= OC_C6S2, r11= OC_C2S6
michael@0 1317 PKHBT r3, r3, r8, LSL #16 ; r3 = <r8|r3>
michael@0 1318 LDR r8, [r1,#12] ; r8 = <x[0,7]|x[0,6]>
michael@0 1319 SMULWB r2, r10,r0 ; r2 = OC_C6S2*x[0,2]>>16
michael@0 1320 SSUB16 r5, r5, r3 ; r5 = <t[1,5]|t[0,5]>
michael@0 1321 SMULWB r9, r10,r12 ; r9 = OC_C6S2*x[1,2]>>16
michael@0 1322 LDR r7, [r1,#28] ; r7 = <x[1,7]|x[1,6]>
michael@0 1323 SMULWB r3, r10,r8 ; r3 = OC_C6S2*x[0,6]>>16
michael@0 1324 SMULWB r10,r10,r7 ; r10= OC_C6S2*x[1,6]>>16
michael@0 1325 PKHBT r2, r2, r9, LSL #16 ; r2 = <r9|r2>
michael@0 1326 SMLAWB r3, r11,r0, r3 ; r3 = t[0,3]=r3+(OC_C2S6*x[0,2]>>16)
michael@0 1327 SMLAWB r10,r11,r12,r10 ; r10= t[1,3]=r10+(OC_C2S6*x[1,2]>>16)
michael@0 1328 SMULWB r9, r11,r8 ; r9 = OC_C2S6*x[0,6]>>16
michael@0 1329 PKHBT r3, r3, r10,LSL #16 ; r3 = <t[1,3]|t[0,3]>
michael@0 1330 SMULWB r12,r11,r7 ; r12= OC_C2S6*x[1,6]>>16
michael@0 1331 ;4-7 rotation by 7pi/16
michael@0 1332 LDRD r10,OC_C7S1_8_v6 ; r10= OC_C7S1, r11= OC_C1S7
michael@0 1333 PKHBT r9, r9, r12,LSL #16 ; r9 = <r12|r9>
michael@0 1334 LDR r0, [r1],#16 ; r0 = <x[0,1]|x[0,0]>
michael@0 1335 PKHTB r7, r7, r8, ASR #16 ; r7 = <x[1,7]|x[0,7]>
michael@0 1336 SSUB16 r2, r2, r9 ; r2 = <t[1,2]|t[0,2]>
michael@0 1337 SMULWB r9, r10,r7 ; r9 = OC_C7S1*x[0,7]>>16
michael@0 1338 LDR r14,[r1],#16 ; r14= <x[1,1]|x[1,0]>
michael@0 1339 SMULWT r12,r10,r7 ; r12= OC_C7S1*x[1,7]>>16
michael@0 1340 SMULWT r8, r10,r0 ; r8 = OC_C7S1*x[0,1]>>16
michael@0 1341 SMULWT r10,r10,r14 ; r10= OC_C7S1*x[1,1]>>16
michael@0 1342 SMLAWT r9, r11,r0, r9 ; r9 = t[0,7]=r9+(OC_C1S7*x[0,1]>>16)
michael@0 1343 PKHBT r8, r8, r10,LSL #16 ; r8 = <r10|r8>
michael@0 1344 SMLAWT r12,r11,r14,r12 ; r12= t[1,7]=r12+(OC_C1S7*x[1,1]>>16)
michael@0 1345 PKHBT r0, r0, r14,LSL #16 ; r0 = <x[1,0]|x[0,0]>
michael@0 1346 SMULWB r10,r11,r7 ; r10= OC_C1S7*x[0,7]>>16
michael@0 1347 PKHBT r9, r9, r12,LSL #16 ; r9 = <t[1,7]|t[0,7]>
michael@0 1348 SMULWT r12,r11,r7 ; r12= OC_C1S7*x[1,7]>>16
michael@0 1349 ;0-1 butterfly
michael@0 1350 LDR r11,OC_C4S4
michael@0 1351 PKHBT r10,r10,r12,LSL #16 ; r10= <r12|r10>
michael@0 1352 SADD16 r7, r0, r4 ; r7 = x[0]+x[4]
michael@0 1353 SSUB16 r10,r8, r10 ; r10= <t[1,4]|t[0,4]>
michael@0 1354 SSUB16 r4, r0, r4 ; r4 = x[0]-x[4]
michael@0 1355 SMULWB r8, r11,r7 ; r8 = t[0,0]=OC_C4S4*r7B>>16
michael@0 1356 SMULWT r12,r11,r7 ; r12= t[1,0]=OC_C4S4*r7T>>16
michael@0 1357 SMULWB r7, r11,r4 ; r7 = t[0,1]=OC_C4S4*r4B>>16
michael@0 1358 PKHBT r12,r8, r12,LSL #16 ; r12= <t[1,0]|t[0,0]>
michael@0 1359 SMULWT r8, r11,r4 ; r8 = t[1,1]=OC_C4S4*r4T>>16
michael@0 1360 ; Stage 2:
michael@0 1361 SADD16 r4, r10,r5 ; r4 = t[4]'=t[4]+t[5]
michael@0 1362 PKHBT r8, r7, r8, LSL #16 ; r8 = <t[1,1]|t[0,1]>
michael@0 1363 SSUB16 r5, r10,r5 ; r5 = t[4]-t[5]
michael@0 1364 SMULWB r10,r11,r5 ; r10= t[0,5]=OC_C4S4*r5B>>16
michael@0 1365 SADD16 r7, r9, r6 ; r7 = t[7]'=t[7]+t[6]
michael@0 1366 SMULWT r5, r11,r5 ; r5 = t[1,5]=OC_C4S4*r5T>>16
michael@0 1367 SSUB16 r6, r9, r6 ; r6 = t[7]-t[6]
michael@0 1368 SMULWB r9, r11,r6 ; r9 = t[0,6]=OC_C4S4*r6B>>16
michael@0 1369 PKHBT r10,r10,r5, LSL #16 ; r10= <t[1,5]|t[0,5]>
michael@0 1370 SMULWT r6, r11,r6 ; r6 = t[1,6]=OC_C4S4*r6T>>16
michael@0 1371 ; Stage 3:
michael@0 1372 SADD16 r11,r8, r2 ; r11= t[1]'=t[1]+t[2]
michael@0 1373 PKHBT r6, r9, r6, LSL #16 ; r6 = <t[1,6]|t[0,6]>
michael@0 1374 SSUB16 r2, r8, r2 ; r2 = t[2]=t[1]-t[2]
michael@0 1375 LDMFD r13!,{r0,r14}
michael@0 1376 B idct4_3core_stage3_5_v6
michael@0 1377 ENDP
michael@0 1378
michael@0 1379 ; Another copy so the LDRD offsets are less than +/- 255.
michael@0 1380 ALIGN 8
michael@0 1381 OC_C7S1_8_v6
michael@0 1382 DCD 12785 ; 31F1
michael@0 1383 OC_C1S7_8_v6
michael@0 1384 DCD 64277 ; FB15
michael@0 1385 OC_C6S2_8_v6
michael@0 1386 DCD 25080 ; 61F8
michael@0 1387 OC_C2S6_8_v6
michael@0 1388 DCD 60547 ; EC83
michael@0 1389 OC_C5S3_8_v6
michael@0 1390 DCD 36410 ; 8E3A
michael@0 1391 OC_C3S5_8_v6
michael@0 1392 DCD 54491 ; D4DB
michael@0 1393
michael@0 1394 idct8_8core_down_v6 PROC
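; Same as idct8_8core_v6, but the +8 rounding bias is folded into t[0]
; and t[1] during stage 1 and the final +8>>4 scaling is applied in
; stage 4 before the stores.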
michael@0 1395 STMFD r13!,{r0,r14}
michael@0 1396 ; Stage 1:
michael@0 1397 ;5-6 rotation by 3pi/16
michael@0 1398 LDRD r10,OC_C5S3_8_v6 ; r10= OC_C5S3, r11= OC_C3S5
michael@0 1399 LDR r4, [r1,#8] ; r4 = <x[0,5]|x[0,4]>
michael@0 1400 LDR r7, [r1,#24] ; r7 = <x[1,5]|x[1,4]>
michael@0 1401 SMULWT r5, r11,r4 ; r5 = OC_C3S5*x[0,5]>>16
michael@0 1402 LDR r0, [r1,#4] ; r0 = <x[0,3]|x[0,2]>
michael@0 1403 SMULWT r3, r11,r7 ; r3 = OC_C3S5*x[1,5]>>16
michael@0 1404 LDR r12,[r1,#20] ; r12= <x[1,3]|x[1,2]>
michael@0 1405 SMULWT r6, r11,r0 ; r6 = OC_C3S5*x[0,3]>>16
michael@0 1406 SMULWT r11,r11,r12 ; r11= OC_C3S5*x[1,3]>>16
michael@0 1407 SMLAWT r6, r10,r4, r6 ; r6 = t[0,6]=r6+(OC_C5S3*x[0,5]>>16)
michael@0 1408 PKHBT r5, r5, r3, LSL #16 ; r5 = <r3|r5>
michael@0 1409 SMLAWT r11,r10,r7, r11 ; r11= t[1,6]=r11+(OC_C5S3*x[1,5]>>16)
michael@0 1410 PKHBT r4, r4, r7, LSL #16 ; r4 = <x[1,4]|x[0,4]>
michael@0 1411 SMULWT r3, r10,r0 ; r3 = OC_C5S3*x[0,3]>>16
michael@0 1412 PKHBT r6, r6, r11,LSL #16 ; r6 = <t[1,6]|t[0,6]>
michael@0 1413 SMULWT r8, r10,r12 ; r8 = OC_C5S3*x[1,3]>>16
michael@0 1414 ;2-3 rotation by 6pi/16
michael@0 1415 LDRD r10,OC_C6S2_8_v6 ; r10= OC_C6S2, r11= OC_C2S6
michael@0 1416 PKHBT r3, r3, r8, LSL #16 ; r3 = <r8|r3>
michael@0 1417 LDR r8, [r1,#12] ; r8 = <x[0,7]|x[0,6]>
michael@0 1418 SMULWB r2, r10,r0 ; r2 = OC_C6S2*x[0,2]>>16
michael@0 1419 SSUB16 r5, r5, r3 ; r5 = <t[1,5]|t[0,5]>
michael@0 1420 SMULWB r9, r10,r12 ; r9 = OC_C6S2*x[1,2]>>16
michael@0 1421 LDR r7, [r1,#28] ; r7 = <x[1,7]|x[1,6]>
michael@0 1422 SMULWB r3, r10,r8 ; r3 = OC_C6S2*x[0,6]>>16
michael@0 1423 SMULWB r10,r10,r7 ; r10= OC_C6S2*x[1,6]>>16
michael@0 1424 PKHBT r2, r2, r9, LSL #16 ; r2 = <r9|r2>
michael@0 1425 SMLAWB r3, r11,r0, r3 ; r3 = t[0,3]=r3+(OC_C2S6*x[0,2]>>16)
michael@0 1426 SMLAWB r10,r11,r12,r10 ; r10= t[1,3]=r10+(OC_C2S6*x[1,2]>>16)
michael@0 1427 SMULWB r9, r11,r8 ; r9 = OC_C2S6*x[0,6]>>16
michael@0 1428 PKHBT r3, r3, r10,LSL #16 ; r3 = <t[1,3]|t[0,3]>
michael@0 1429 SMULWB r12,r11,r7 ; r12= OC_C2S6*x[1,6]>>16
michael@0 1430 ;4-7 rotation by 7pi/16
michael@0 1431 LDRD r10,OC_C7S1_8_v6 ; r10= OC_C7S1, r11= OC_C1S7
michael@0 1432 PKHBT r9, r9, r12,LSL #16 ; r9 = <r12|r9>
michael@0 1433 LDR r0, [r1],#16 ; r0 = <x[0,1]|x[0,0]>
michael@0 1434 PKHTB r7, r7, r8, ASR #16 ; r7 = <x[1,7]|x[0,7]>
michael@0 1435 SSUB16 r2, r2, r9 ; r2 = <t[1,2]|t[0,2]>
michael@0 1436 SMULWB r9, r10,r7 ; r9 = OC_C7S1*x[0,7]>>16
michael@0 1437 LDR r14,[r1],#16 ; r14= <x[1,1]|x[1,0]>
michael@0 1438 SMULWT r12,r10,r7 ; r12= OC_C7S1*x[1,7]>>16
michael@0 1439 SMULWT r8, r10,r0 ; r8 = OC_C7S1*x[0,1]>>16
michael@0 1440 SMULWT r10,r10,r14 ; r10= OC_C7S1*x[1,1]>>16
michael@0 1441 SMLAWT r9, r11,r0, r9 ; r9 = t[0,7]=r9+(OC_C1S7*x[0,1]>>16)
michael@0 1442 PKHBT r8, r8, r10,LSL #16 ; r8 = <r10|r8>
michael@0 1443 SMLAWT r12,r11,r14,r12 ; r12= t[1,7]=r12+(OC_C1S7*x[1,1]>>16)
michael@0 1444 PKHBT r0, r0, r14,LSL #16 ; r0 = <x[1,0]|x[0,0]>
michael@0 1445 SMULWB r10,r11,r7 ; r10= OC_C1S7*x[0,7]>>16
michael@0 1446 PKHBT r9, r9, r12,LSL #16 ; r9 = <t[1,7]|t[0,7]>
michael@0 1447 SMULWT r12,r11,r7 ; r12= OC_C1S7*x[1,7]>>16
michael@0 1448 ;0-1 butterfly
michael@0 1449 LDR r11,OC_C4S4
michael@0 1450 MOV r14,#8
michael@0 1451 PKHBT r10,r10,r12,LSL #16 ; r10= <r12|r10>
michael@0 1452 SADD16 r7, r0, r4 ; r7 = x[0]+x[4]
michael@0 1453 SSUB16 r10,r8, r10 ; r10= <t[1,4]|t[0,4]>
michael@0 1454 SMLAWB r8, r11,r7, r14 ; r8 = t[0,0]+8=(OC_C4S4*r7B>>16)+8
michael@0 1455 SSUB16 r4, r0, r4 ; r4 = x[0]-x[4]
michael@0 1456 SMLAWT r12,r11,r7, r14 ; r12= t[1,0]+8=(OC_C4S4*r7T>>16)+8
michael@0 1457 SMLAWB r7, r11,r4, r14 ; r7 = t[0,1]+8=(OC_C4S4*r4B>>16)+8
michael@0 1458 PKHBT r12,r8, r12,LSL #16 ; r12= <t[1,0]+8|t[0,0]+8>
michael@0 1459 SMLAWT r8, r11,r4, r14 ; r8 = t[1,1]+8=(OC_C4S4*r4T>>16)+8
michael@0 1460 ; Stage 2:
michael@0 1461 SADD16 r4, r10,r5 ; r4 = t[4]'=t[4]+t[5]
michael@0 1462 PKHBT r8, r7, r8, LSL #16 ; r8 = <t[1,1]+8|t[0,1]+8>
michael@0 1463 SSUB16 r5, r10,r5 ; r5 = t[4]-t[5]
michael@0 1464 SMULWB r10,r11,r5 ; r10= t[0,5]=OC_C4S4*r5B>>16
michael@0 1465 SADD16 r7, r9, r6 ; r7 = t[7]'=t[7]+t[6]
michael@0 1466 SMULWT r5, r11,r5 ; r5 = t[1,5]=OC_C4S4*r5T>>16
michael@0 1467 SSUB16 r6, r9, r6 ; r6 = t[7]-t[6]
michael@0 1468 SMULWB r9, r11,r6 ; r9 = t[0,6]=OC_C4S4*r6B>>16
michael@0 1469 PKHBT r10,r10,r5, LSL #16 ; r10= <t[1,5]|t[0,5]>
michael@0 1470 SMULWT r6, r11,r6 ; r6 = t[1,6]=OC_C4S4*r6T>>16
michael@0 1471 ; Stage 3:
michael@0 1472 SADD16 r11,r8, r2 ; r11= t[1]'+8=t[1]+t[2]+8
michael@0 1473 PKHBT r6, r9, r6, LSL #16 ; r6 = <t[1,6]|t[0,6]>
michael@0 1474 SSUB16 r2, r8, r2 ; r2 = t[2]+8=t[1]-t[2]+8
michael@0 1475 LDMFD r13!,{r0,r14}
michael@0 1476 idct8_8core_down_stage3_5_v6
michael@0 1477 SSUB16 r5, r6, r10 ; r5 = t[5]'=t[6]-t[5]
michael@0 1478 SADD16 r6, r6, r10 ; r6 = t[6]=t[6]+t[5]
michael@0 1479 SADD16 r10,r12,r3 ; r10= t[0]'+8=t[0]+t[3]+8
michael@0 1480 SSUB16 r3, r12,r3 ; r3 = t[3]+8=t[0]-t[3]+8
michael@0 1481 ; Stage 4:
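; Each register holds two packed 16-bit results.  MOV ...,ASR #4 shifts
; the high lane into place, the LSL #16 plus the PKHTB's ASR #20 shift
; the low lane, and the PKHTB packs them back together, i.e. an
; independent (v+8)>>4 on each halfword.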
michael@0 1482 SADD16 r12,r10,r7 ; r12= t[0]+t[7]+8
michael@0 1483 SSUB16 r7, r10,r7 ; r7 = t[0]-t[7]+8
michael@0 1484 MOV r10,r12,ASR #4
michael@0 1485 MOV r12,r12,LSL #16
michael@0 1486 PKHTB r10,r10,r12,ASR #20 ; r10= t[0]+t[7]+8>>4
michael@0 1487 STR r10,[r0], #4 ; y[0<<3] = t[0]+t[7]+8>>4
michael@0 1488 SADD16 r12,r11,r6 ; r12= t[1]+t[6]+8
michael@0 1489 SSUB16 r6, r11,r6 ; r6 = t[1]-t[6]+8
michael@0 1490 MOV r10,r12,ASR #4
michael@0 1491 MOV r12,r12,LSL #16
michael@0 1492 PKHTB r10,r10,r12,ASR #20 ; r10= t[1]+t[6]+8>>4
michael@0 1493 STR r10,[r0, #12] ; y[1<<3] = t[1]+t[6]+8>>4
michael@0 1494 SADD16 r12,r2, r5 ; r12= t[2]+t[5]+8
michael@0 1495 SSUB16 r5, r2, r5 ; r5 = t[2]-t[5]+8
michael@0 1496 MOV r10,r12,ASR #4
michael@0 1497 MOV r12,r12,LSL #16
michael@0 1498 PKHTB r10,r10,r12,ASR #20 ; r10= t[2]+t[5]+8>>4
michael@0 1499 STR r10,[r0, #28] ; y[2<<3] = t[2]+t[5]+8>>4
michael@0 1500 SADD16 r12,r3, r4 ; r12= t[3]+t[4]+8
michael@0 1501 SSUB16 r4, r3, r4 ; r4 = t[3]-t[4]+8
michael@0 1502 MOV r10,r12,ASR #4
michael@0 1503 MOV r12,r12,LSL #16
michael@0 1504 PKHTB r10,r10,r12,ASR #20 ; r10= t[3]+t[4]+8>>4
michael@0 1505 STR r10,[r0, #44] ; y[3<<3] = t[3]+t[4]+8>>4
michael@0 1506 MOV r10,r4, ASR #4
michael@0 1507 MOV r4, r4, LSL #16
michael@0 1508 PKHTB r10,r10,r4, ASR #20 ; r10= t[3]-t[4]+8>>4
michael@0 1509 STR r10,[r0, #60] ; y[4<<3] = t[3]-t[4]+8>>4
michael@0 1510 MOV r10,r5, ASR #4
michael@0 1511 MOV r5, r5, LSL #16
michael@0 1512 PKHTB r10,r10,r5, ASR #20 ; r10= t[2]-t[5]+8>>4
michael@0 1513 STR r10,[r0, #76] ; y[5<<3] = t[2]-t[5]+8>>4
michael@0 1514 MOV r10,r6, ASR #4
michael@0 1515 MOV r6, r6, LSL #16
michael@0 1516 PKHTB r10,r10,r6, ASR #20 ; r10= t[1]-t[6]+8>>4
michael@0 1517 STR r10,[r0, #92] ; y[6<<3] = t[1]-t[6]+8>>4
michael@0 1518 MOV r10,r7, ASR #4
michael@0 1519 MOV r7, r7, LSL #16
michael@0 1520 PKHTB r10,r10,r7, ASR #20 ; r10= t[0]-t[7]+8>>4
michael@0 1521 STR r10,[r0, #108] ; y[7<<3] = t[0]-t[7]+8>>4
michael@0 1522 MOV PC,r14
michael@0 1523 ENDP
michael@0 1524 ]
michael@0 1525
michael@0 1526 [ OC_ARM_ASM_NEON
michael@0 1527 EXPORT oc_idct8x8_1_neon
michael@0 1528 EXPORT oc_idct8x8_neon
michael@0 1529
michael@0 1530 ALIGN 16
michael@0 1531 OC_IDCT_CONSTS_NEON
michael@0 1532 DCW 8
michael@0 1533 DCW 64277 ; FB15 (C1S7)
michael@0 1534 DCW 60547 ; EC83 (C2S6)
michael@0 1535 DCW 54491 ; D4DB (C3S5)
michael@0 1536 DCW 46341 ; B505 (C4S4)
michael@0 1537 DCW 36410 ; 8E3A (C5S3)
michael@0 1538 DCW 25080 ; 61F8 (C6S2)
michael@0 1539 DCW 12785 ; 31F1 (C7S1)
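; The table is loaded whole into D0:D1, so the scalar lanes used by the
; VMULLs below are D0[0]=8, D0[1]=C1S7, D0[2]=C2S6, D0[3]=C3S5,
; D1[0]=C4S4, D1[1]=C5S3, D1[2]=C6S2, D1[3]=C7S1.  Constants >= 0x8000
; are negative as signed 16-bit lanes, so VMULL.S16 by them yields
; OC_CxSy*x-(x<<16); the extra -x term is cancelled with an explicit
; add or subtract of x when the results are combined.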
michael@0 1540
michael@0 1541 oc_idct8x8_1_neon PROC
michael@0 1542 ; r0 = ogg_int16_t *_y
michael@0 1543 ; r1 = ogg_uint16_t _dc
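; Fills all 64 coefficients of _y with the precomputed DC value, 32
; bytes per store.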
michael@0 1544 VDUP.S16 Q0, r1
michael@0 1545 VMOV Q1, Q0
michael@0 1546 VST1.64 {D0, D1, D2, D3}, [r0@128]!
michael@0 1547 VST1.64 {D0, D1, D2, D3}, [r0@128]!
michael@0 1548 VST1.64 {D0, D1, D2, D3}, [r0@128]!
michael@0 1549 VST1.64 {D0, D1, D2, D3}, [r0@128]
michael@0 1550 MOV PC, r14
michael@0 1551 ENDP
michael@0 1552
michael@0 1553 oc_idct8x8_neon PROC
michael@0 1554 ; r0 = ogg_int16_t *_y
michael@0 1555 ; r1 = ogg_int16_t *_x
michael@0 1556 ; r2 = int _last_zzi
michael@0 1557 CMP r2, #10
michael@0 1558 BLE oc_idct8x8_10_neon
michael@0 1559 oc_idct8x8_slow_neon
michael@0 1560 VPUSH {D8-D15}
michael@0 1561 MOV r2, r1
michael@0 1562 ADR r3, OC_IDCT_CONSTS_NEON
michael@0 1563 ; Row transforms (input is pre-transposed)
michael@0 1564 VLD1.64 {D16,D17,D18,D19}, [r2@128]!
michael@0 1565 VLD1.64 {D20,D21,D22,D23}, [r2@128]!
michael@0 1566 VLD1.64 {D24,D25,D26,D27}, [r2@128]!
michael@0 1567 VSUB.S16 Q1, Q8, Q12 ; Q1 = x[0]-x[4]
michael@0 1568 VLD1.64 {D28,D29,D30,D31}, [r2@128]
michael@0 1569 VADD.S16 Q8, Q8, Q12 ; Q8 = x[0]+x[4]
michael@0 1570 VLD1.64 {D0,D1}, [r3@128]
michael@0 1571 MOV r12, r14
michael@0 1572 BL oc_idct8x8_stage123_neon
michael@0 1573 ; Stage 4
michael@0 1574 VSUB.S16 Q15,Q8, Q7 ; Q15 = y[7]=t[0]'-t[7]'
michael@0 1575 VADD.S16 Q8, Q8, Q7 ; Q8 = y[0]=t[0]'+t[7]'
michael@0 1576 VSUB.S16 Q14,Q9, Q3 ; Q14 = y[6]=t[1]'-t[6]''
michael@0 1577 VADD.S16 Q9, Q9, Q3 ; Q9 = y[1]=t[1]'+t[6]''
michael@0 1578 VSUB.S16 Q13,Q10,Q5 ; Q13 = y[5]=t[2]'-t[5]''
michael@0 1579 VADD.S16 Q10,Q10,Q5 ; Q10 = y[2]=t[2]'+t[5]''
michael@0 1580 VTRN.16 Q14,Q15
michael@0 1581 VSUB.S16 Q12,Q11,Q4 ; Q12 = y[4]=t[3]'-t[4]'
michael@0 1582 VADD.S16 Q11,Q11,Q4 ; Q11 = y[3]=t[3]'+t[4]'
michael@0 1583 ; 8x8 Transpose
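; VTRN.16 transposes 2x2 blocks of halfwords between register pairs,
; VTRN.32 does the same for 2x2 blocks of words, and the VSWPs exchange
; the remaining 64-bit halves, leaving the 8x8 block in Q8-Q15 fully
; transposed for the column pass.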
michael@0 1584 VTRN.16 Q8, Q9
michael@0 1585 VTRN.16 Q10,Q11
michael@0 1586 VTRN.16 Q12,Q13
michael@0 1587 VTRN.32 Q8, Q10
michael@0 1588 VTRN.32 Q9, Q11
michael@0 1589 VTRN.32 Q12,Q14
michael@0 1590 VTRN.32 Q13,Q15
michael@0 1591 VSWP D17,D24
michael@0 1592 VSUB.S16 Q1, Q8, Q12 ; Q1 = x[0]-x[4]
michael@0 1593 VSWP D19,D26
michael@0 1594 VADD.S16 Q8, Q8, Q12 ; Q8 = x[0]+x[4]
michael@0 1595 VSWP D21,D28
michael@0 1596 VSWP D23,D30
michael@0 1597 ; Column transforms
michael@0 1598 BL oc_idct8x8_stage123_neon
michael@0 1599 CMP r0,r1
michael@0 1600 ; We have to put the return address back in the LR, or the branch
michael@0 1601 ; predictor will not recognize the function return and mis-predict the
michael@0 1602 ; entire call stack.
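; (r12 still holds the caller's return address saved before the first
; BL, so no stack access is needed to restore it.)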
michael@0 1603 MOV r14, r12
michael@0 1604 ; Stage 4
michael@0 1605 VSUB.S16 Q15,Q8, Q7 ; Q15 = y[7]=t[0]'-t[7]'
michael@0 1606 VADD.S16 Q8, Q8, Q7 ; Q8 = y[0]=t[0]'+t[7]'
michael@0 1607 VSUB.S16 Q14,Q9, Q3 ; Q14 = y[6]=t[1]'-t[6]''
michael@0 1608 VADD.S16 Q9, Q9, Q3 ; Q9 = y[1]=t[1]'+t[6]''
michael@0 1609 VSUB.S16 Q13,Q10,Q5 ; Q13 = y[5]=t[2]'-t[5]''
michael@0 1610 VADD.S16 Q10,Q10,Q5 ; Q10 = y[2]=t[2]'+t[5]''
michael@0 1611 VSUB.S16 Q12,Q11,Q4 ; Q12 = y[4]=t[3]'-t[4]'
michael@0 1612 VADD.S16 Q11,Q11,Q4 ; Q11 = y[3]=t[3]'+t[4]'
michael@0 1613 BEQ oc_idct8x8_slow_neon_noclear
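; _y and _x are distinct: zero the 128-byte input block, interleaving
; the stores of zeros with the rounding shifts below.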
michael@0 1614 VMOV.I8 Q2,#0
michael@0 1615 VPOP {D8-D15}
michael@0 1616 VMOV.I8 Q3,#0
michael@0 1617 VRSHR.S16 Q8, Q8, #4 ; Q8 = y[0]+8>>4
michael@0 1618 VST1.64 {D4, D5, D6, D7}, [r1@128]!
michael@0 1619 VRSHR.S16 Q9, Q9, #4 ; Q9 = y[1]+8>>4
michael@0 1620 VRSHR.S16 Q10,Q10,#4 ; Q10 = y[2]+8>>4
michael@0 1621 VST1.64 {D4, D5, D6, D7}, [r1@128]!
michael@0 1622 VRSHR.S16 Q11,Q11,#4 ; Q11 = y[3]+8>>4
michael@0 1623 VRSHR.S16 Q12,Q12,#4 ; Q12 = y[4]+8>>4
michael@0 1624 VST1.64 {D4, D5, D6, D7}, [r1@128]!
michael@0 1625 VRSHR.S16 Q13,Q13,#4 ; Q13 = y[5]+8>>4
michael@0 1626 VRSHR.S16 Q14,Q14,#4 ; Q14 = y[6]+8>>4
michael@0 1627 VST1.64 {D4, D5, D6, D7}, [r1@128]
michael@0 1628 VRSHR.S16 Q15,Q15,#4 ; Q15 = y[7]+8>>4
michael@0 1629 VSTMIA r0, {D16-D31}
michael@0 1630 MOV PC, r14
michael@0 1631
michael@0 1632 oc_idct8x8_slow_neon_noclear
michael@0 1633 VPOP {D8-D15}
michael@0 1634 VRSHR.S16 Q8, Q8, #4 ; Q8 = y[0]+8>>4
michael@0 1635 VRSHR.S16 Q9, Q9, #4 ; Q9 = y[1]+8>>4
michael@0 1636 VRSHR.S16 Q10,Q10,#4 ; Q10 = y[2]+8>>4
michael@0 1637 VRSHR.S16 Q11,Q11,#4 ; Q11 = y[3]+8>>4
michael@0 1638 VRSHR.S16 Q12,Q12,#4 ; Q12 = y[4]+8>>4
michael@0 1639 VRSHR.S16 Q13,Q13,#4 ; Q13 = y[5]+8>>4
michael@0 1640 VRSHR.S16 Q14,Q14,#4 ; Q14 = y[6]+8>>4
michael@0 1641 VRSHR.S16 Q15,Q15,#4 ; Q15 = y[7]+8>>4
michael@0 1642 VSTMIA r0, {D16-D31}
michael@0 1643 MOV PC, r14
michael@0 1644 ENDP
michael@0 1645
michael@0 1646 oc_idct8x8_stage123_neon PROC
michael@0 1647 ; Stages 1 & 2
michael@0 1648 VMULL.S16 Q4, D18,D1[3]
michael@0 1649 VMULL.S16 Q5, D19,D1[3]
michael@0 1650 VMULL.S16 Q7, D30,D1[3]
michael@0 1651 VMULL.S16 Q6, D31,D1[3]
michael@0 1652 VMULL.S16 Q2, D30,D0[1]
michael@0 1653 VMULL.S16 Q3, D31,D0[1]
michael@0 1654 VSHRN.S32 D8, Q4, #16
michael@0 1655 VSHRN.S32 D9, Q5, #16 ; Q4 = (OC_C7S1*x[1]>>16)
michael@0 1656 VSHRN.S32 D14,Q7, #16
michael@0 1657 VSHRN.S32 D15,Q6, #16 ; Q7 = (OC_C7S1*x[7]>>16)
michael@0 1658 VSHRN.S32 D4, Q2, #16
michael@0 1659 VSHRN.S32 D5, Q3, #16 ; Q2 = (OC_C1S7*x[7]>>16)-x[7]
michael@0 1660 VSUB.S16 Q4, Q4, Q15
michael@0 1661 VADD.S16 Q7, Q7, Q9
michael@0 1662 VSUB.S16 Q4, Q4, Q2 ; Q4 = t[4]
michael@0 1663 VMULL.S16 Q2, D18,D0[1]
michael@0 1664 VMULL.S16 Q9, D19,D0[1]
michael@0 1665 VMULL.S16 Q5, D26,D0[3]
michael@0 1666 VMULL.S16 Q3, D27,D0[3]
michael@0 1667 VMULL.S16 Q6, D22,D0[3]
michael@0 1668 VMULL.S16 Q12,D23,D0[3]
michael@0 1669 VSHRN.S32 D4, Q2, #16
michael@0 1670 VSHRN.S32 D5, Q9, #16 ; Q2 = (OC_C1S7*x[1]>>16)-x[1]
michael@0 1671 VSHRN.S32 D10,Q5, #16
michael@0 1672 VSHRN.S32 D11,Q3, #16 ; Q5 = (OC_C3S5*x[5]>>16)-x[5]
michael@0 1673 VSHRN.S32 D12,Q6, #16
michael@0 1674 VSHRN.S32 D13,Q12,#16 ; Q6 = (OC_C3S5*x[3]>>16)-x[3]
michael@0 1675 VADD.S16 Q7, Q7, Q2 ; Q7 = t[7]
michael@0 1676 VSUB.S16 Q5, Q5, Q11
michael@0 1677 VADD.S16 Q6, Q6, Q11
michael@0 1678 VADD.S16 Q5, Q5, Q13
michael@0 1679 VADD.S16 Q6, Q6, Q13
michael@0 1680 VMULL.S16 Q9, D22,D1[1]
michael@0 1681 VMULL.S16 Q11,D23,D1[1]
michael@0 1682 VMULL.S16 Q15,D26,D1[1]
michael@0 1683 VMULL.S16 Q13,D27,D1[1]
michael@0 1684 VMULL.S16 Q2, D20,D1[2]
michael@0 1685 VMULL.S16 Q12,D21,D1[2]
michael@0 1686 VSHRN.S32 D18,Q9, #16
michael@0 1687 VSHRN.S32 D19,Q11,#16 ; Q9 = (OC_C5S3*x[3]>>16)-x[3]
michael@0 1688 VSHRN.S32 D30,Q15,#16
michael@0 1689 VSHRN.S32 D31,Q13,#16 ; Q15= (OC_C5S3*x[5]>>16)-x[5]
michael@0 1690 VSHRN.S32 D4, Q2, #16
michael@0 1691 VSHRN.S32 D5, Q12,#16 ; Q2 = (OC_C6S2*x[2]>>16)
michael@0 1692 VSUB.S16 Q5, Q5, Q9 ; Q5 = t[5]
michael@0 1693 VADD.S16 Q6, Q6, Q15 ; Q6 = t[6]
michael@0 1694 VSUB.S16 Q2, Q2, Q14
michael@0 1695 VMULL.S16 Q3, D28,D1[2]
michael@0 1696 VMULL.S16 Q11,D29,D1[2]
michael@0 1697 VMULL.S16 Q12,D28,D0[2]
michael@0 1698 VMULL.S16 Q9, D29,D0[2]
michael@0 1699 VMULL.S16 Q13,D20,D0[2]
michael@0 1700 VMULL.S16 Q15,D21,D0[2]
michael@0 1701 VSHRN.S32 D6, Q3, #16
michael@0 1702 VSHRN.S32 D7, Q11,#16 ; Q3 = (OC_C6S2*x[6]>>16)
michael@0 1703 VSHRN.S32 D24,Q12,#16
michael@0 1704 VSHRN.S32 D25,Q9, #16 ; Q12= (OC_C2S6*x[6]>>16)-x[6]
michael@0 1705 VSHRN.S32 D26,Q13,#16
michael@0 1706 VSHRN.S32 D27,Q15,#16 ; Q13= (OC_C2S6*x[2]>>16)-x[2]
michael@0 1707 VSUB.S16 Q9, Q4, Q5 ; Q9 = t[4]-t[5]
michael@0 1708 VSUB.S16 Q11,Q7, Q6 ; Q11= t[7]-t[6]
michael@0 1709 VADD.S16 Q3, Q3, Q10
michael@0 1710 VADD.S16 Q4, Q4, Q5 ; Q4 = t[4]'=t[4]+t[5]
michael@0 1711 VADD.S16 Q7, Q7, Q6 ; Q7 = t[7]'=t[7]+t[6]
michael@0 1712 VSUB.S16 Q2, Q2, Q12 ; Q2 = t[2]
michael@0 1713 VADD.S16 Q3, Q3, Q13 ; Q3 = t[3]
michael@0 1714 VMULL.S16 Q12,D16,D1[0]
michael@0 1715 VMULL.S16 Q13,D17,D1[0]
michael@0 1716 VMULL.S16 Q14,D2, D1[0]
michael@0 1717 VMULL.S16 Q15,D3, D1[0]
michael@0 1718 VMULL.S16 Q5, D18,D1[0]
michael@0 1719 VMULL.S16 Q6, D22,D1[0]
michael@0 1720 VSHRN.S32 D24,Q12,#16
michael@0 1721 VSHRN.S32 D25,Q13,#16
michael@0 1722 VSHRN.S32 D28,Q14,#16
michael@0 1723 VSHRN.S32 D29,Q15,#16
michael@0 1724 VMULL.S16 Q13,D19,D1[0]
michael@0 1725 VMULL.S16 Q15,D23,D1[0]
michael@0 1726 VADD.S16 Q8, Q8, Q12 ; Q8 = t[0]
michael@0 1727 VADD.S16 Q1, Q1, Q14 ; Q1 = t[1]
michael@0 1728 VSHRN.S32 D10,Q5, #16
michael@0 1729 VSHRN.S32 D12,Q6, #16
michael@0 1730 VSHRN.S32 D11,Q13,#16
michael@0 1731 VSHRN.S32 D13,Q15,#16
michael@0 1732 VADD.S16 Q5, Q5, Q9 ; Q5 = t[5]'=OC_C4S4*(t[4]-t[5])>>16
michael@0 1733 VADD.S16 Q6, Q6, Q11 ; Q6 = t[6]'=OC_C4S4*(t[7]-t[6])>>16
michael@0 1734 ; Stage 3
michael@0 1735 VSUB.S16 Q11,Q8, Q3 ; Q11 = t[3]''=t[0]-t[3]
michael@0 1736 VADD.S16 Q8, Q8, Q3 ; Q8 = t[0]''=t[0]+t[3]
michael@0 1737 VADD.S16 Q9, Q1, Q2 ; Q9 = t[1]''=t[1]+t[2]
michael@0 1738 VADD.S16 Q3, Q6, Q5 ; Q3 = t[6]''=t[6]'+t[5]'
michael@0 1739 VSUB.S16 Q10,Q1, Q2 ; Q10 = t[2]''=t[1]-t[2]
michael@0 1740 VSUB.S16 Q5, Q6, Q5 ; Q5 = t[5]''=t[6]'-t[5]'
michael@0 1741 MOV PC, r14
michael@0 1742 ENDP
michael@0 1743
michael@0 1744 oc_idct8x8_10_neon PROC
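; r0 = ogg_int16_t *_y
; r1 = ogg_int16_t *_x
; Specialized for _last_zzi <= 10: the first ten zig-zag coefficients
; all lie in the upper-left 4x4 block, so only four (pre-transposed)
; rows are loaded and only their first four entries are kept.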
michael@0 1745 ADR r3, OC_IDCT_CONSTS_NEON
michael@0 1746 VLD1.64 {D0,D1}, [r3@128]
michael@0 1747 MOV r2, r1
michael@0 1748 ; Row transforms (input is pre-transposed)
michael@0 1749 ; Stage 1
michael@0 1750 VLD1.64 {D16,D17,D18,D19},[r2@128]!
michael@0 1751 MOV r12, #16
michael@0 1752 VMULL.S16 Q15,D16,D1[0] ; Q15= OC_C4S4*x[0]-(x[0]<<16)
michael@0 1753 VLD1.64 {D17}, [r2@64], r12
michael@0 1754 VMULL.S16 Q2, D18,D0[1] ; Q2 = OC_C1S7*x[1]-(x[1]<<16)
michael@0 1755 VLD1.64 {D19}, [r2@64]
michael@0 1756 VMULL.S16 Q14,D17,D0[2] ; Q14= OC_C2S6*x[2]-(x[2]<<16)
michael@0 1757 VMULL.S16 Q3, D19,D0[3] ; Q3 = OC_C3S5*x[3]-(x[3]<<16)
michael@0 1758 VMULL.S16 Q13,D19,D1[1] ; Q13= OC_C5S3*x[3]-(x[3]<<16)
michael@0 1759 VMULL.S16 Q12,D18,D1[3] ; Q12= OC_C7S1*x[1]
michael@0 1760 VMULL.S16 Q1, D17,D1[2] ; Q1 = OC_C6S2*x[2]
michael@0 1761 VSHRN.S32 D30,Q15,#16 ; D30= t[0]-x[0]
michael@0 1762 VSHRN.S32 D4, Q2, #16 ; D4 = t[7]-x[1]
michael@0 1763 VSHRN.S32 D31,Q14,#16 ; D31= t[3]-x[2]
michael@0 1764 VSHRN.S32 D6, Q3, #16 ; D6 = t[6]-x[3]
michael@0 1765 VSHRN.S32 D7, Q13,#16 ; D7 = -t[5]-x[3]
michael@0 1766 VSHRN.S32 D5, Q12,#16 ; D5 = t[4]
michael@0 1767 VSHRN.S32 D2, Q1, #16 ; D2 = t[2]
michael@0 1768 VADD.S16 D4, D4, D18 ; D4 = t[7]
michael@0 1769 VADD.S16 D6, D6, D19 ; D6 = t[6]
michael@0 1770 VADD.S16 D7, D7, D19 ; D7 = -t[5]
michael@0 1771 VADD.S16 Q15,Q15,Q8 ; D30= t[0]
michael@0 1772 ; D31= t[3]
michael@0 1773 ; Stages 2 & 3
michael@0 1774 VSUB.S16 Q12,Q2, Q3 ; D24= t[7]-t[6]
michael@0 1775 ; D25= t[4]'=t[4]+t[5]
michael@0 1776 VADD.S16 Q13,Q2, Q3 ; D26= t[7]'=t[7]+t[6]
michael@0 1777 ; D27= t[4]-t[5]
michael@0 1778 VMULL.S16 Q11,D24,D1[0] ; Q11= OC_C4S4*(t[7]-t[6])
michael@0 1779 ; -(t[7]-t[6]<<16)
michael@0 1780 VMULL.S16 Q14,D27,D1[0] ; Q14= OC_C4S4*(t[4]-t[5])
michael@0 1781 ; -(t[4]-t[5]<<16)
michael@0 1782 VADD.S16 D16,D30,D31 ; D16= t[0]'=t[0]+t[3]
michael@0 1783 VSUB.S16 D17,D30,D2 ; D17= t[2]'=t[0]-t[2]
michael@0 1784 VADD.S16 D18,D30,D2 ; D18= t[1]'=t[0]+t[2]
michael@0 1785 VSHRN.S32 D22,Q11,#16 ; D22= (OC_C4S4*(t[7]-t[6])>>16)
michael@0 1786 ; -(t[7]-t[6])
michael@0 1787 VSHRN.S32 D23,Q14,#16 ; D23= (OC_C4S4*(t[4]-t[5])>>16)
michael@0 1788 ; -(t[4]-t[5])
michael@0 1789 VSUB.S16 D19,D30,D31 ; D19= t[3]'=t[0]-t[3]
michael@0 1790 VADD.S16 D22,D22,D24 ; D22= t[6]'=OC_C4S4*(t[7]-t[6])>>16
michael@0 1791 VADD.S16 D23,D23,D27 ; D23= t[5]'=OC_C4S4*(t[4]-t[5])>>16
michael@0 1792 VSUB.S16 D27,D22,D23 ; D27= t[5]''=t[6]'-t[5]'
michael@0 1793 VADD.S16 D24,D22,D23 ; D24= t[6]''=t[6]'+t[5]'
michael@0 1794 ; Stage 4
michael@0 1795 VSUB.S16 Q11,Q8, Q13 ; D22= y[7]=t[0]'-t[7]'
michael@0 1796 ; D23= y[5]=t[2]'-t[5]''
michael@0 1797 VSUB.S16 Q10,Q9, Q12 ; D20= y[6]=t[1]'-t[6]'
michael@0 1798 ; D21= y[4]=t[3]'-t[4]''
michael@0 1799 VADD.S16 Q8, Q8, Q13 ; D16= y[0]=t[0]'+t[7]'
michael@0 1800 ; D17= y[2]=t[2]'+t[5]''
michael@0 1801 VADD.S16 Q9, Q9, Q12 ; D18= y[1]=t[1]'+t[6]''
michael@0 1802 ; D19= y[3]=t[3]'+t[4]'
michael@0 1803 ; 8x4 transpose
michael@0 1804 VTRN.16 Q10,Q11 ; Q10= c5c4a5a4 c7c6a7a6
michael@0 1805 ; Q11= d5d4b5b4 d7d6b7b6
michael@0 1806 VTRN.16 Q8, Q9 ; Q8 = c3c2a3a2 c1c0a1a0
michael@0 1807 ; Q9 = d3d2b3b2 d1d0b1b0
michael@0 1808 VSWP D20,D21 ; Q10= c7c6a7a6 c5c4a5a4
michael@0 1809 VSWP D22,D23 ; Q11= d7d6b7b6 d5d4b5b4
michael@0 1810 VUZP.32 Q9, Q11 ; Q9 = b7b6b5b4 b3b2b1b0
michael@0 1811 ; Q11= d7d6d5d4 d3d2d1d0
michael@0 1812 VMULL.S16 Q15,D18,D0[1]
michael@0 1813 VMULL.S16 Q13,D22,D1[1]
michael@0 1814 VUZP.32 Q8, Q10 ; Q8 = a7a6a5a4 a3a2a1a0
michael@0 1815 ; Q10= c7c6c5c4 c3c2c1c0
michael@0 1816 ; Column transforms
michael@0 1817 ; Stages 1, 2, & 3
michael@0 1818 VMULL.S16 Q14,D19,D0[1] ; Q14:Q15= OC_C1S7*x[1]-(x[1]<<16)
michael@0 1819 VMULL.S16 Q12,D23,D1[1] ; Q12:Q13= OC_C5S3*x[3]-(x[3]<<16)
michael@0 1820 VMULL.S16 Q3, D22,D0[3]
michael@0 1821 VMULL.S16 Q2, D23,D0[3] ; Q2:Q3 = OC_C3S5*x[3]-(x[3]<<16)
michael@0 1822 VSHRN.S32 D30,Q15,#16
michael@0 1823 VSHRN.S32 D31,Q14,#16 ; Q15= (OC_C1S7*x[1]>>16)-x[1]
michael@0 1824 VSHRN.S32 D26,Q13,#16
michael@0 1825 VSHRN.S32 D27,Q12,#16 ; Q13= (OC_C5S3*x[3]>>16)-x[3]
michael@0 1826 VSHRN.S32 D28,Q3, #16
michael@0 1827 VSHRN.S32 D29,Q2, #16 ; Q14= (OC_C3S5*x[3]>>16)-x[3]
michael@0 1828 VADD.S16 Q15,Q15,Q9 ; Q15= t[7]
michael@0 1829 VADD.S16 Q13,Q13,Q11 ; Q13= -t[5]
michael@0 1830 VADD.S16 Q14,Q14,Q11 ; Q14= t[6]
michael@0 1831 VMULL.S16 Q12,D18,D1[3]
michael@0 1832 VMULL.S16 Q2, D19,D1[3] ; Q2:Q12= OC_C7S1*x[1]
michael@0 1833 VMULL.S16 Q1, D16,D1[0]
michael@0 1834 VMULL.S16 Q11,D17,D1[0] ; Q11:Q1 = OC_C4S4*x[0]-(x[0]<<16)
michael@0 1835 VMULL.S16 Q3, D20,D0[2]
michael@0 1836 VMULL.S16 Q9, D21,D0[2] ; Q9:Q3 = OC_C2S6*x[2]-(x[2]<<16)
michael@0 1837 VSHRN.S32 D24,Q12,#16
michael@0 1838 VSHRN.S32 D25,Q2, #16 ; Q12= t[4]
michael@0 1839 VMULL.S16 Q2, D20,D1[2]
michael@0 1840 VSHRN.S32 D2, Q1, #16
michael@0 1841 VSHRN.S32 D3, Q11,#16 ; Q1 = (OC_C4S4*x[0]>>16)-x[0]
michael@0 1842 VMULL.S16 Q11,D21,D1[2] ; Q2:Q11= OC_C6S2*x[2]
michael@0 1843 VSHRN.S32 D6, Q3, #16
michael@0 1844 VSHRN.S32 D7, Q9, #16 ; Q3 = (OC_C2S6*x[2]>>16)-x[2]
michael@0 1845 VSUB.S16 Q9, Q15,Q14 ; Q9 = t[7]-t[6]
michael@0 1846 VADD.S16 Q15,Q15,Q14 ; Q15= t[7]'=t[7]+t[6]
michael@0 1847 VSHRN.S32 D4, Q2, #16
michael@0 1848 VSHRN.S32 D5, Q11,#16 ; Q2 = t[2]
michael@0 1849 VADD.S16 Q1, Q1, Q8 ; Q1 = t[0]
michael@0 1850 VADD.S16 Q8, Q12,Q13 ; Q8 = t[4]-t[5]
michael@0 1851 VADD.S16 Q3, Q3, Q10 ; Q3 = t[3]
michael@0 1852 VMULL.S16 Q10,D16,D1[0]
michael@0 1853 VMULL.S16 Q11,D17,D1[0] ; Q11:Q10= OC_C4S4*(t[4]-t[5])
michael@0 1854 ; -(t[4]-t[5]<<16)
michael@0 1855 VSUB.S16 Q12,Q12,Q13 ; Q12= t[4]'=t[4]+t[5]
michael@0 1856 VMULL.S16 Q14,D18,D1[0]
michael@0 1857 VMULL.S16 Q13,D19,D1[0] ; Q13:Q14= OC_C4S4*(t[7]-t[6])
michael@0 1858 ; -(t[7]-t[6]<<16)
michael@0 1859 VSHRN.S32 D20,Q10,#16
michael@0 1860 VSHRN.S32 D21,Q11,#16 ; Q10= (OC_C4S4*(t[4]-t[5])>>16)
michael@0 1861 ; -(t[4]-t[5])
michael@0 1862 VADD.S16 Q11,Q1, Q3 ; Q11= t[0]'=t[0]+t[3]
michael@0 1863 VSUB.S16 Q3, Q1, Q3 ; Q3 = t[3]'=t[0]-t[3]
michael@0 1864 VSHRN.S32 D28,Q14,#16
michael@0 1865 VSHRN.S32 D29,Q13,#16 ; Q14= (OC_C4S4*(t[7]-t[6])>>16)
michael@0 1866 ; -(t[7]-t[6])
michael@0 1867 VADD.S16 Q10,Q10,Q8 ; Q10=t[5]'
michael@0 1868 VADD.S16 Q14,Q14,Q9 ; Q14=t[6]'
michael@0 1869 VSUB.S16 Q13,Q14,Q10 ; Q13=t[5]''=t[6]'-t[5]'
michael@0 1870 VADD.S16 Q14,Q14,Q10 ; Q14=t[6]''=t[6]'+t[5]'
michael@0 1871 VADD.S16 Q10,Q1, Q2 ; Q10= t[1]'=t[0]+t[2]
michael@0 1872 VSUB.S16 Q2, Q1, Q2 ; Q2 = t[2]'=t[0]-t[2]
michael@0 1873 ; Stage 4
michael@0 1874 CMP r0, r1
michael@0 1875 VADD.S16 Q8, Q11,Q15 ; Q8 = y[0]=t[0]'+t[7]'
michael@0 1876 VADD.S16 Q9, Q10,Q14 ; Q9 = y[1]=t[1]'+t[6]''
michael@0 1877 VSUB.S16 Q15,Q11,Q15 ; Q15 = y[7]=t[0]'-t[7]'
michael@0 1878 VSUB.S16 Q14,Q10,Q14 ; Q14 = y[6]=t[1]'-t[6]''
michael@0 1879 VADD.S16 Q10,Q2, Q13 ; Q10 = y[2]=t[2]'+t[5]''
michael@0 1880 VADD.S16 Q11,Q3, Q12 ; Q11 = y[3]=t[3]'+t[4]'
michael@0 1881 VSUB.S16 Q12,Q3, Q12 ; Q12 = y[4]=t[3]'-t[4]'
michael@0 1882 VSUB.S16 Q13,Q2, Q13 ; Q13 = y[5]=t[2]'-t[5]''
michael@0 1883 BEQ oc_idct8x8_10_neon_noclear
michael@0 1884 VMOV.I8 D2, #0
michael@0 1885 VRSHR.S16 Q8, Q8, #4 ; Q8 = y[0]+8>>4
michael@0 1886 VST1.64 {D2}, [r1@64], r12
michael@0 1887 VRSHR.S16 Q9, Q9, #4 ; Q9 = y[1]+8>>4
michael@0 1888 VRSHR.S16 Q10,Q10,#4 ; Q10 = y[2]+8>>4
michael@0 1889 VST1.64 {D2}, [r1@64], r12
michael@0 1890 VRSHR.S16 Q11,Q11,#4 ; Q11 = y[3]+8>>4
michael@0 1891 VRSHR.S16 Q12,Q12,#4 ; Q12 = y[4]+8>>4
michael@0 1892 VST1.64 {D2}, [r1@64], r12
michael@0 1893 VRSHR.S16 Q13,Q13,#4 ; Q13 = y[5]+8>>4
michael@0 1894 VRSHR.S16 Q14,Q14,#4 ; Q14 = y[6]+8>>4
michael@0 1895 VST1.64 {D2}, [r1@64]
michael@0 1896 VRSHR.S16 Q15,Q15,#4 ; Q15 = y[7]+8>>4
michael@0 1897 VSTMIA r0, {D16-D31}
michael@0 1898 MOV PC, r14
michael@0 1899
michael@0 1900 oc_idct8x8_10_neon_noclear
michael@0 1901 VRSHR.S16 Q8, Q8, #4 ; Q8 = y[0]+8>>4
michael@0 1902 VRSHR.S16 Q9, Q9, #4 ; Q9 = y[1]+8>>4
michael@0 1903 VRSHR.S16 Q10,Q10,#4 ; Q10 = y[2]+8>>4
michael@0 1904 VRSHR.S16 Q11,Q11,#4 ; Q11 = y[3]+8>>4
michael@0 1905 VRSHR.S16 Q12,Q12,#4 ; Q12 = y[4]+8>>4
michael@0 1906 VRSHR.S16 Q13,Q13,#4 ; Q13 = y[5]+8>>4
michael@0 1907 VRSHR.S16 Q14,Q14,#4 ; Q14 = y[6]+8>>4
michael@0 1908 VRSHR.S16 Q15,Q15,#4 ; Q15 = y[7]+8>>4
michael@0 1909 VSTMIA r0, {D16-D31}
michael@0 1910 MOV PC, r14
michael@0 1911 ENDP
michael@0 1912 ]
michael@0 1913
michael@0 1914 END
