Wed, 31 Dec 2014 06:09:35 +0100
Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.
michael@0 | 1 | ;******************************************************************** |
michael@0 | 2 | ;* * |
michael@0 | 3 | ;* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * |
michael@0 | 4 | ;* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * |
michael@0 | 5 | ;* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * |
michael@0 | 6 | ;* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * |
michael@0 | 7 | ;* * |
michael@0 | 8 | ;* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010 * |
michael@0 | 9 | ;* by the Xiph.Org Foundation and contributors http://www.xiph.org/ * |
michael@0 | 10 | ;* * |
michael@0 | 11 | ;******************************************************************** |
michael@0 | 12 | ; Original implementation: |
michael@0 | 13 | ; Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd |
michael@0 | 14 | ; last mod: $Id: armidct.s 17481 2010-10-03 22:49:42Z tterribe $ |
michael@0 | 15 | ;******************************************************************** |
michael@0 | 16 | |
michael@0 | 17 | AREA |.text|, CODE, READONLY |
michael@0 | 18 | |
michael@0 | 19 | ; Explicitly specifying alignment here because some versions of |
michael@0 | 20 | ; gas don't align code correctly. See |
michael@0 | 21 | ; http://lists.gnu.org/archive/html/bug-binutils/2011-06/msg00199.html |
michael@0 | 22 | ; https://bugzilla.mozilla.org/show_bug.cgi?id=920992 |
michael@0 | 23 | ALIGN |
michael@0 | 24 | |
michael@0 | 25 | GET armopts.s |
michael@0 | 26 | |
michael@0 | 27 | EXPORT oc_idct8x8_1_arm |
michael@0 | 28 | EXPORT oc_idct8x8_arm |
michael@0 | 29 | |
michael@0 | 30 | oc_idct8x8_1_arm PROC |
michael@0 | 31 | ; r0 = ogg_int16_t *_y |
michael@0 | 32 | ; r1 = ogg_uint16_t _dc |
michael@0 | 33 | ORR r1, r1, r1, LSL #16 |
michael@0 | 34 | MOV r2, r1 |
michael@0 | 35 | MOV r3, r1 |
michael@0 | 36 | MOV r12,r1 |
michael@0 | 37 | STMIA r0!,{r1,r2,r3,r12} |
michael@0 | 38 | STMIA r0!,{r1,r2,r3,r12} |
michael@0 | 39 | STMIA r0!,{r1,r2,r3,r12} |
michael@0 | 40 | STMIA r0!,{r1,r2,r3,r12} |
michael@0 | 41 | STMIA r0!,{r1,r2,r3,r12} |
michael@0 | 42 | STMIA r0!,{r1,r2,r3,r12} |
michael@0 | 43 | STMIA r0!,{r1,r2,r3,r12} |
michael@0 | 44 | STMIA r0!,{r1,r2,r3,r12} |
michael@0 | 45 | MOV PC, r14 |
michael@0 | 46 | ENDP |
michael@0 | 47 | |
michael@0 | 48 | oc_idct8x8_arm PROC |
michael@0 | 49 | ; r0 = ogg_int16_t *_y |
michael@0 | 50 | ; r1 = ogg_int16_t *_x |
michael@0 | 51 | ; r2 = int _last_zzi |
michael@0 | 52 | CMP r2, #3 |
michael@0 | 53 | BLE oc_idct8x8_3_arm |
michael@0 | 54 | CMP r2, #6 |
michael@0 | 55 | BLE oc_idct8x8_6_arm |
michael@0 | 56 | CMP r2, #10 |
michael@0 | 57 | BLE oc_idct8x8_10_arm |
michael@0 | 58 | oc_idct8x8_slow_arm |
michael@0 | 59 | STMFD r13!,{r4-r11,r14} |
michael@0 | 60 | SUB r13,r13,#64*2 |
michael@0 | 61 | ; Row transforms |
michael@0 | 62 | STR r0, [r13,#-4]! |
michael@0 | 63 | ADD r0, r13, #4 ; Write to temp storage. |
michael@0 | 64 | BL idct8core_arm |
michael@0 | 65 | BL idct8core_arm |
michael@0 | 66 | BL idct8core_arm |
michael@0 | 67 | BL idct8core_arm |
michael@0 | 68 | BL idct8core_arm |
michael@0 | 69 | BL idct8core_arm |
michael@0 | 70 | BL idct8core_arm |
michael@0 | 71 | BL idct8core_arm |
michael@0 | 72 | LDR r0, [r13], #4 ; Write to the final destination. |
michael@0 | 73 | ; Clear input data for next block (decoder only). |
michael@0 | 74 | SUB r2, r1, #8*16 |
michael@0 | 75 | CMP r0, r2 |
michael@0 | 76 | MOV r1, r13 ; And read from temp storage. |
michael@0 | 77 | BEQ oc_idct8x8_slow_arm_cols |
michael@0 | 78 | MOV r4, #0 |
michael@0 | 79 | MOV r5, #0 |
michael@0 | 80 | MOV r6, #0 |
michael@0 | 81 | MOV r7, #0 |
michael@0 | 82 | STMIA r2!,{r4,r5,r6,r7} |
michael@0 | 83 | STMIA r2!,{r4,r5,r6,r7} |
michael@0 | 84 | STMIA r2!,{r4,r5,r6,r7} |
michael@0 | 85 | STMIA r2!,{r4,r5,r6,r7} |
michael@0 | 86 | STMIA r2!,{r4,r5,r6,r7} |
michael@0 | 87 | STMIA r2!,{r4,r5,r6,r7} |
michael@0 | 88 | STMIA r2!,{r4,r5,r6,r7} |
michael@0 | 89 | STMIA r2!,{r4,r5,r6,r7} |
michael@0 | 90 | oc_idct8x8_slow_arm_cols |
michael@0 | 91 | ; Column transforms |
michael@0 | 92 | BL idct8core_down_arm |
michael@0 | 93 | BL idct8core_down_arm |
michael@0 | 94 | BL idct8core_down_arm |
michael@0 | 95 | BL idct8core_down_arm |
michael@0 | 96 | BL idct8core_down_arm |
michael@0 | 97 | BL idct8core_down_arm |
michael@0 | 98 | BL idct8core_down_arm |
michael@0 | 99 | BL idct8core_down_arm |
michael@0 | 100 | ADD r13,r13,#64*2 |
michael@0 | 101 | LDMFD r13!,{r4-r11,PC} |
michael@0 | 102 | ENDP |
michael@0 | 103 | |
michael@0 | 104 | oc_idct8x8_10_arm PROC |
michael@0 | 105 | STMFD r13!,{r4-r11,r14} |
michael@0 | 106 | SUB r13,r13,#64*2 |
michael@0 | 107 | ; Row transforms |
michael@0 | 108 | MOV r2, r0 |
michael@0 | 109 | MOV r0, r13 ; Write to temp storage. |
michael@0 | 110 | BL idct4core_arm |
michael@0 | 111 | BL idct3core_arm |
michael@0 | 112 | BL idct2core_arm |
michael@0 | 113 | BL idct1core_arm |
michael@0 | 114 | ; Clear input data for next block (decoder only). |
michael@0 | 115 | SUB r0, r1, #4*16 |
michael@0 | 116 | CMP r0, r2 |
michael@0 | 117 | MOV r1, r13 ; Read from temp storage. |
michael@0 | 118 | BEQ oc_idct8x8_10_arm_cols |
michael@0 | 119 | MOV r4, #0 |
michael@0 | 120 | STR r4, [r0] |
michael@0 | 121 | STR r4, [r0,#4] |
michael@0 | 122 | STR r4, [r0,#16] |
michael@0 | 123 | STR r4, [r0,#20] |
michael@0 | 124 | STR r4, [r0,#32] |
michael@0 | 125 | STR r4, [r0,#48] |
michael@0 | 126 | MOV r0, r2 ; Write to the final destination |
michael@0 | 127 | oc_idct8x8_10_arm_cols |
michael@0 | 128 | ; Column transforms |
michael@0 | 129 | BL idct4core_down_arm |
michael@0 | 130 | BL idct4core_down_arm |
michael@0 | 131 | BL idct4core_down_arm |
michael@0 | 132 | BL idct4core_down_arm |
michael@0 | 133 | BL idct4core_down_arm |
michael@0 | 134 | BL idct4core_down_arm |
michael@0 | 135 | BL idct4core_down_arm |
michael@0 | 136 | BL idct4core_down_arm |
michael@0 | 137 | ADD r13,r13,#64*2 |
michael@0 | 138 | LDMFD r13!,{r4-r11,PC} |
michael@0 | 139 | ENDP |
michael@0 | 140 | |
michael@0 | 141 | oc_idct8x8_6_arm PROC |
michael@0 | 142 | STMFD r13!,{r4-r7,r9-r11,r14} |
michael@0 | 143 | SUB r13,r13,#64*2 |
michael@0 | 144 | ; Row transforms |
michael@0 | 145 | MOV r2, r0 |
michael@0 | 146 | MOV r0, r13 ; Write to temp storage. |
michael@0 | 147 | BL idct3core_arm |
michael@0 | 148 | BL idct2core_arm |
michael@0 | 149 | BL idct1core_arm |
michael@0 | 150 | ; Clear input data for next block (decoder only). |
michael@0 | 151 | SUB r0, r1, #3*16 |
michael@0 | 152 | CMP r0, r2 |
michael@0 | 153 | MOV r1, r13 ; Read from temp storage. |
michael@0 | 154 | BEQ oc_idct8x8_6_arm_cols |
michael@0 | 155 | MOV r4, #0 |
michael@0 | 156 | STR r4, [r0] |
michael@0 | 157 | STR r4, [r0,#4] |
michael@0 | 158 | STR r4, [r0,#16] |
michael@0 | 159 | STR r4, [r0,#32] |
michael@0 | 160 | MOV r0, r2 ; Write to the final destination |
michael@0 | 161 | oc_idct8x8_6_arm_cols |
michael@0 | 162 | ; Column transforms |
michael@0 | 163 | BL idct3core_down_arm |
michael@0 | 164 | BL idct3core_down_arm |
michael@0 | 165 | BL idct3core_down_arm |
michael@0 | 166 | BL idct3core_down_arm |
michael@0 | 167 | BL idct3core_down_arm |
michael@0 | 168 | BL idct3core_down_arm |
michael@0 | 169 | BL idct3core_down_arm |
michael@0 | 170 | BL idct3core_down_arm |
michael@0 | 171 | ADD r13,r13,#64*2 |
michael@0 | 172 | LDMFD r13!,{r4-r7,r9-r11,PC} |
michael@0 | 173 | ENDP |
michael@0 | 174 | |
michael@0 | 175 | oc_idct8x8_3_arm PROC |
michael@0 | 176 | STMFD r13!,{r4-r7,r9-r11,r14} |
michael@0 | 177 | SUB r13,r13,#64*2 |
michael@0 | 178 | ; Row transforms |
michael@0 | 179 | MOV r2, r0 |
michael@0 | 180 | MOV r0, r13 ; Write to temp storage. |
michael@0 | 181 | BL idct2core_arm |
michael@0 | 182 | BL idct1core_arm |
michael@0 | 183 | ; Clear input data for next block (decoder only). |
michael@0 | 184 | SUB r0, r1, #2*16 |
michael@0 | 185 | CMP r0, r2 |
michael@0 | 186 | MOV r1, r13 ; Read from temp storage. |
michael@0 | 187 | MOVNE r4, #0 |
michael@0 | 188 | STRNE r4, [r0] |
michael@0 | 189 | STRNE r4, [r0,#16] |
michael@0 | 190 | MOVNE r0, r2 ; Write to the final destination |
michael@0 | 191 | ; Column transforms |
michael@0 | 192 | BL idct2core_down_arm |
michael@0 | 193 | BL idct2core_down_arm |
michael@0 | 194 | BL idct2core_down_arm |
michael@0 | 195 | BL idct2core_down_arm |
michael@0 | 196 | BL idct2core_down_arm |
michael@0 | 197 | BL idct2core_down_arm |
michael@0 | 198 | BL idct2core_down_arm |
michael@0 | 199 | BL idct2core_down_arm |
michael@0 | 200 | ADD r13,r13,#64*2 |
michael@0 | 201 | LDMFD r13!,{r4-r7,r9-r11,PC} |
michael@0 | 202 | ENDP |
michael@0 | 203 | |
michael@0 | 204 | idct1core_arm PROC |
michael@0 | 205 | ; r0 = ogg_int16_t *_y (destination) |
michael@0 | 206 | ; r1 = const ogg_int16_t *_x (source) |
michael@0 | 207 | LDRSH r3, [r1], #16 |
michael@0 | 208 | MOV r12,#0x05 |
michael@0 | 209 | ORR r12,r12,#0xB500 |
michael@0 | 210 | MUL r3, r12, r3 |
michael@0 | 211 | ; Stall ? |
michael@0 | 212 | MOV r3, r3, ASR #16 |
michael@0 | 213 | STRH r3, [r0], #2 |
michael@0 | 214 | STRH r3, [r0, #14] |
michael@0 | 215 | STRH r3, [r0, #30] |
michael@0 | 216 | STRH r3, [r0, #46] |
michael@0 | 217 | STRH r3, [r0, #62] |
michael@0 | 218 | STRH r3, [r0, #78] |
michael@0 | 219 | STRH r3, [r0, #94] |
michael@0 | 220 | STRH r3, [r0, #110] |
michael@0 | 221 | MOV PC,R14 |
michael@0 | 222 | ENDP |
michael@0 | 223 | |
michael@0 | 224 | idct2core_arm PROC |
michael@0 | 225 | ; r0 = ogg_int16_t *_y (destination) |
michael@0 | 226 | ; r1 = const ogg_int16_t *_x (source) |
michael@0 | 227 | LDRSH r9, [r1], #16 ; r9 = x[0] |
michael@0 | 228 | LDR r12,OC_C4S4 |
michael@0 | 229 | LDRSH r11,[r1, #-14] ; r11= x[1] |
michael@0 | 230 | LDR r3, OC_C7S1 |
michael@0 | 231 | MUL r9, r12,r9 ; r9 = t[0]<<16 = OC_C4S4*x[0] |
michael@0 | 232 | LDR r10,OC_C1S7 |
michael@0 | 233 | MUL r3, r11,r3 ; r3 = t[4]<<16 = OC_C7S1*x[1] |
michael@0 | 234 | MOV r9, r9, ASR #16 ; r9 = t[0] |
michael@0 | 235 | MUL r11,r10,r11 ; r11= t[7]<<16 = OC_C1S7*x[1] |
michael@0 | 236 | MOV r3, r3, ASR #16 ; r3 = t[4] |
michael@0 | 237 | MUL r10,r12,r3 ; r10= t[5]<<16 = OC_C4S4*t[4] |
michael@0 | 238 | MOV r11,r11,ASR #16 ; r11= t[7] |
michael@0 | 239 | MUL r12,r11,r12 ; r12= t[6]<<16 = OC_C4S4*t[7] |
michael@0 | 240 | MOV r10,r10,ASR #16 ; r10= t[5] |
michael@0 | 241 | ADD r12,r9,r12,ASR #16 ; r12= t[0]+t[6] |
michael@0 | 242 | ADD r12,r12,r10 ; r12= t[0]+t2[6] = t[0]+t[6]+t[5] |
michael@0 | 243 | SUB r10,r12,r10,LSL #1 ; r10= t[0]+t2[5] = t[0]+t[6]-t[5] |
michael@0 | 244 | ADD r3, r3, r9 ; r3 = t[0]+t[4] |
michael@0 | 245 | ADD r11,r11,r9 ; r11= t[0]+t[7] |
michael@0 | 246 | STRH r11,[r0], #2 ; y[0] = t[0]+t[7] |
michael@0 | 247 | STRH r12,[r0, #14] ; y[1] = t[0]+t[6] |
michael@0 | 248 | STRH r10,[r0, #30] ; y[2] = t[0]+t[5] |
michael@0 | 249 | STRH r3, [r0, #46] ; y[3] = t[0]+t[4] |
michael@0 | 250 | RSB r3, r3, r9, LSL #1 ; r3 = t[0]*2-(t[0]+t[4])=t[0]-t[4] |
michael@0 | 251 | RSB r10,r10,r9, LSL #1 ; r10= t[0]*2-(t[0]+t[5])=t[0]-t[5] |
michael@0 | 252 | RSB r12,r12,r9, LSL #1 ; r12= t[0]*2-(t[0]+t[6])=t[0]-t[6] |
michael@0 | 253 | RSB r11,r11,r9, LSL #1 ; r1 = t[0]*2-(t[0]+t[7])=t[0]-t[7] |
michael@0 | 254 | STRH r3, [r0, #62] ; y[4] = t[0]-t[4] |
michael@0 | 255 | STRH r10,[r0, #78] ; y[5] = t[0]-t[5] |
michael@0 | 256 | STRH r12,[r0, #94] ; y[6] = t[0]-t[6] |
michael@0 | 257 | STRH r11,[r0, #110] ; y[7] = t[0]-t[7] |
michael@0 | 258 | MOV PC,r14 |
michael@0 | 259 | ENDP |
michael@0 | 260 | |
michael@0 | 261 | idct2core_down_arm PROC |
michael@0 | 262 | ; r0 = ogg_int16_t *_y (destination) |
michael@0 | 263 | ; r1 = const ogg_int16_t *_x (source) |
michael@0 | 264 | LDRSH r9, [r1], #16 ; r9 = x[0] |
michael@0 | 265 | LDR r12,OC_C4S4 |
michael@0 | 266 | LDRSH r11,[r1, #-14] ; r11= x[1] |
michael@0 | 267 | LDR r3, OC_C7S1 |
michael@0 | 268 | MUL r9, r12,r9 ; r9 = t[0]<<16 = OC_C4S4*x[0] |
michael@0 | 269 | LDR r10,OC_C1S7 |
michael@0 | 270 | MUL r3, r11,r3 ; r3 = t[4]<<16 = OC_C7S1*x[1] |
michael@0 | 271 | MOV r9, r9, ASR #16 ; r9 = t[0] |
michael@0 | 272 | MUL r11,r10,r11 ; r11= t[7]<<16 = OC_C1S7*x[1] |
michael@0 | 273 | ADD r9, r9, #8 ; r9 = t[0]+8 |
michael@0 | 274 | MOV r3, r3, ASR #16 ; r3 = t[4] |
michael@0 | 275 | MUL r10,r12,r3 ; r10= t[5]<<16 = OC_C4S4*t[4] |
michael@0 | 276 | MOV r11,r11,ASR #16 ; r11= t[7] |
michael@0 | 277 | MUL r12,r11,r12 ; r12= t[6]<<16 = OC_C4S4*t[7] |
michael@0 | 278 | MOV r10,r10,ASR #16 ; r10= t[5] |
michael@0 | 279 | ADD r12,r9,r12,ASR #16 ; r12= t[0]+t[6]+8 |
michael@0 | 280 | ADD r12,r12,r10 ; r12= t[0]+t2[6] = t[0]+t[6]+t[5]+8 |
michael@0 | 281 | SUB r10,r12,r10,LSL #1 ; r10= t[0]+t2[5] = t[0]+t[6]-t[5]+8 |
michael@0 | 282 | ADD r3, r3, r9 ; r3 = t[0]+t[4]+8 |
michael@0 | 283 | ADD r11,r11,r9 ; r11= t[0]+t[7]+8 |
michael@0 | 284 | ; TODO: This is wrong. |
michael@0 | 285 | ; The C code truncates to 16 bits by storing to RAM and doing the |
michael@0 | 286 | ; shifts later; we've got an extra 4 bits here. |
michael@0 | 287 | MOV r4, r11,ASR #4 |
michael@0 | 288 | MOV r5, r12,ASR #4 |
michael@0 | 289 | MOV r6, r10,ASR #4 |
michael@0 | 290 | MOV r7, r3, ASR #4 |
michael@0 | 291 | RSB r3, r3, r9, LSL #1 ;r3 =t[0]*2+8-(t[0]+t[4])=t[0]-t[4]+8 |
michael@0 | 292 | RSB r10,r10,r9, LSL #1 ;r10=t[0]*2+8-(t[0]+t[5])=t[0]-t[5]+8 |
michael@0 | 293 | RSB r12,r12,r9, LSL #1 ;r12=t[0]*2+8-(t[0]+t[6])=t[0]-t[6]+8 |
michael@0 | 294 | RSB r11,r11,r9, LSL #1 ;r11=t[0]*2+8-(t[0]+t[7])=t[0]-t[7]+8 |
michael@0 | 295 | MOV r3, r3, ASR #4 |
michael@0 | 296 | MOV r10,r10,ASR #4 |
michael@0 | 297 | MOV r12,r12,ASR #4 |
michael@0 | 298 | MOV r11,r11,ASR #4 |
michael@0 | 299 | STRH r4, [r0], #2 ; y[0] = t[0]+t[7] |
michael@0 | 300 | STRH r5, [r0, #14] ; y[1] = t[0]+t[6] |
michael@0 | 301 | STRH r6, [r0, #30] ; y[2] = t[0]+t[5] |
michael@0 | 302 | STRH r7, [r0, #46] ; y[3] = t[0]+t[4] |
michael@0 | 303 | STRH r3, [r0, #62] ; y[4] = t[0]-t[4] |
michael@0 | 304 | STRH r10,[r0, #78] ; y[5] = t[0]-t[5] |
michael@0 | 305 | STRH r12,[r0, #94] ; y[6] = t[0]-t[6] |
michael@0 | 306 | STRH r11,[r0, #110] ; y[7] = t[0]-t[7] |
michael@0 | 307 | MOV PC,r14 |
michael@0 | 308 | ENDP |
michael@0 | 309 | |
michael@0 | 310 | idct3core_arm PROC |
michael@0 | 311 | LDRSH r9, [r1], #16 ; r9 = x[0] |
michael@0 | 312 | LDR r12,OC_C4S4 ; r12= OC_C4S4 |
michael@0 | 313 | LDRSH r3, [r1, #-12] ; r3 = x[2] |
michael@0 | 314 | LDR r10,OC_C6S2 ; r10= OC_C6S2 |
michael@0 | 315 | MUL r9, r12,r9 ; r9 = t[0]<<16 = OC_C4S4*x[0] |
michael@0 | 316 | LDR r4, OC_C2S6 ; r4 = OC_C2S6 |
michael@0 | 317 | MUL r10,r3, r10 ; r10= t[2]<<16 = OC_C6S2*x[2] |
michael@0 | 318 | LDRSH r11,[r1, #-14] ; r11= x[1] |
michael@0 | 319 | MUL r3, r4, r3 ; r3 = t[3]<<16 = OC_C2S6*x[2] |
michael@0 | 320 | LDR r4, OC_C7S1 ; r4 = OC_C7S1 |
michael@0 | 321 | LDR r5, OC_C1S7 ; r5 = OC_C1S7 |
michael@0 | 322 | MOV r9, r9, ASR #16 ; r9 = t[0] |
michael@0 | 323 | MUL r4, r11,r4 ; r4 = t[4]<<16 = OC_C7S1*x[1] |
michael@0 | 324 | ADD r3, r9, r3, ASR #16 ; r3 = t[0]+t[3] |
michael@0 | 325 | MUL r11,r5, r11 ; r11= t[7]<<16 = OC_C1S7*x[1] |
michael@0 | 326 | MOV r4, r4, ASR #16 ; r4 = t[4] |
michael@0 | 327 | MUL r5, r12,r4 ; r5 = t[5]<<16 = OC_C4S4*t[4] |
michael@0 | 328 | MOV r11,r11,ASR #16 ; r11= t[7] |
michael@0 | 329 | MUL r12,r11,r12 ; r12= t[6]<<16 = OC_C4S4*t[7] |
michael@0 | 330 | ADD r10,r9, r10,ASR #16 ; r10= t[1] = t[0]+t[2] |
michael@0 | 331 | RSB r6, r10,r9, LSL #1 ; r6 = t[2] = t[0]-t[2] |
michael@0 | 332 | ; r3 = t2[0] = t[0]+t[3] |
michael@0 | 333 | RSB r9, r3, r9, LSL #1 ; r9 = t2[3] = t[0]-t[3] |
michael@0 | 334 | MOV r12,r12,ASR #16 ; r12= t[6] |
michael@0 | 335 | ADD r5, r12,r5, ASR #16 ; r5 = t2[6] = t[6]+t[5] |
michael@0 | 336 | RSB r12,r5, r12,LSL #1 ; r12= t2[5] = t[6]-t[5] |
michael@0 | 337 | ADD r11,r3, r11 ; r11= t2[0]+t[7] |
michael@0 | 338 | ADD r5, r10,r5 ; r5 = t[1]+t2[6] |
michael@0 | 339 | ADD r12,r6, r12 ; r12= t[2]+t2[5] |
michael@0 | 340 | ADD r4, r9, r4 ; r4 = t2[3]+t[4] |
michael@0 | 341 | STRH r11,[r0], #2 ; y[0] = t[0]+t[7] |
michael@0 | 342 | STRH r5, [r0, #14] ; y[1] = t[1]+t2[6] |
michael@0 | 343 | STRH r12,[r0, #30] ; y[2] = t[2]+t2[5] |
michael@0 | 344 | STRH r4, [r0, #46] ; y[3] = t2[3]+t[4] |
michael@0 | 345 | RSB r11,r11,r3, LSL #1 ; r11= t2[0] - t[7] |
michael@0 | 346 | RSB r5, r5, r10,LSL #1 ; r5 = t[1] - t2[6] |
michael@0 | 347 | RSB r12,r12,r6, LSL #1 ; r6 = t[2] - t2[5] |
michael@0 | 348 | RSB r4, r4, r9, LSL #1 ; r4 = t2[3] - t[4] |
michael@0 | 349 | STRH r4, [r0, #62] ; y[4] = t2[3]-t[4] |
michael@0 | 350 | STRH r12,[r0, #78] ; y[5] = t[2]-t2[5] |
michael@0 | 351 | STRH r5, [r0, #94] ; y[6] = t[1]-t2[6] |
michael@0 | 352 | STRH r11,[r0, #110] ; y[7] = t2[0]-t[7] |
michael@0 | 353 | MOV PC,R14 |
michael@0 | 354 | ENDP |
michael@0 | 355 | |
michael@0 | 356 | idct3core_down_arm PROC |
michael@0 | 357 | LDRSH r9, [r1], #16 ; r9 = x[0] |
michael@0 | 358 | LDR r12,OC_C4S4 ; r12= OC_C4S4 |
michael@0 | 359 | LDRSH r3, [r1, #-12] ; r3 = x[2] |
michael@0 | 360 | LDR r10,OC_C6S2 ; r10= OC_C6S2 |
michael@0 | 361 | MUL r9, r12,r9 ; r9 = t[0]<<16 = OC_C4S4*x[0] |
michael@0 | 362 | LDR r4, OC_C2S6 ; r4 = OC_C2S6 |
michael@0 | 363 | MUL r10,r3, r10 ; r10= t[2]<<16 = OC_C6S2*x[2] |
michael@0 | 364 | LDRSH r11,[r1, #-14] ; r11= x[1] |
michael@0 | 365 | MUL r3, r4, r3 ; r3 = t[3]<<16 = OC_C2S6*x[2] |
michael@0 | 366 | LDR r4, OC_C7S1 ; r4 = OC_C7S1 |
michael@0 | 367 | LDR r5, OC_C1S7 ; r5 = OC_C1S7 |
michael@0 | 368 | MOV r9, r9, ASR #16 ; r9 = t[0] |
michael@0 | 369 | MUL r4, r11,r4 ; r4 = t[4]<<16 = OC_C7S1*x[1] |
michael@0 | 370 | ADD r9, r9, #8 ; r9 = t[0]+8 |
michael@0 | 371 | MUL r11,r5, r11 ; r11= t[7]<<16 = OC_C1S7*x[1] |
michael@0 | 372 | ADD r3, r9, r3, ASR #16 ; r3 = t[0]+t[3]+8 |
michael@0 | 373 | MOV r4, r4, ASR #16 ; r4 = t[4] |
michael@0 | 374 | MUL r5, r12,r4 ; r5 = t[5]<<16 = OC_C4S4*t[4] |
michael@0 | 375 | MOV r11,r11,ASR #16 ; r11= t[7] |
michael@0 | 376 | MUL r12,r11,r12 ; r12= t[6]<<16 = OC_C4S4*t[7] |
michael@0 | 377 | ADD r10,r9, r10,ASR #16 ; r10= t[1]+8 = t[0]+t[2]+8 |
michael@0 | 378 | RSB r6, r10,r9, LSL #1 ; r6 = t[2]+8 = t[0]-t[2]+8 |
michael@0 | 379 | ; r3 = t2[0]+8 = t[0]+t[3]+8 |
michael@0 | 380 | RSB r9, r3, r9, LSL #1 ; r9 = t2[3]+8 = t[0]-t[3]+8 |
michael@0 | 381 | MOV r12,r12,ASR #16 ; r12= t[6] |
michael@0 | 382 | ADD r5, r12,r5, ASR #16 ; r5 = t2[6] = t[6]+t[5] |
michael@0 | 383 | RSB r12,r5, r12,LSL #1 ; r12= t2[5] = t[6]-t[5] |
michael@0 | 384 | ADD r11,r3, r11 ; r11= t2[0]+t[7] +8 |
michael@0 | 385 | ADD r5, r10,r5 ; r5 = t[1] +t2[6]+8 |
michael@0 | 386 | ADD r12,r6, r12 ; r12= t[2] +t2[5]+8 |
michael@0 | 387 | ADD r4, r9, r4 ; r4 = t2[3]+t[4] +8 |
michael@0 | 388 | RSB r3, r11,r3, LSL #1 ; r11= t2[0] - t[7] + 8 |
michael@0 | 389 | RSB r10,r5, r10,LSL #1 ; r5 = t[1] - t2[6] + 8 |
michael@0 | 390 | RSB r6, r12,r6, LSL #1 ; r6 = t[2] - t2[5] + 8 |
michael@0 | 391 | RSB r9, r4, r9, LSL #1 ; r4 = t2[3] - t[4] + 8 |
michael@0 | 392 | ; TODO: This is wrong. |
michael@0 | 393 | ; The C code truncates to 16 bits by storing to RAM and doing the |
michael@0 | 394 | ; shifts later; we've got an extra 4 bits here. |
michael@0 | 395 | MOV r11,r11,ASR #4 |
michael@0 | 396 | MOV r5, r5, ASR #4 |
michael@0 | 397 | MOV r12,r12,ASR #4 |
michael@0 | 398 | MOV r4, r4, ASR #4 |
michael@0 | 399 | MOV r9, r9, ASR #4 |
michael@0 | 400 | MOV r6, r6, ASR #4 |
michael@0 | 401 | MOV r10,r10,ASR #4 |
michael@0 | 402 | MOV r3, r3, ASR #4 |
michael@0 | 403 | STRH r11,[r0], #2 ; y[0] = t[0]+t[7] |
michael@0 | 404 | STRH r5, [r0, #14] ; y[1] = t[1]+t2[6] |
michael@0 | 405 | STRH r12,[r0, #30] ; y[2] = t[2]+t2[5] |
michael@0 | 406 | STRH r4, [r0, #46] ; y[3] = t2[3]+t[4] |
michael@0 | 407 | STRH r9, [r0, #62] ; y[4] = t2[3]-t[4] |
michael@0 | 408 | STRH r6, [r0, #78] ; y[5] = t[2]-t2[5] |
michael@0 | 409 | STRH r10,[r0, #94] ; y[6] = t[1]-t2[6] |
michael@0 | 410 | STRH r3, [r0, #110] ; y[7] = t2[0]-t[7] |
michael@0 | 411 | MOV PC,R14 |
michael@0 | 412 | ENDP |
michael@0 | 413 | |
michael@0 | 414 | idct4core_arm PROC |
michael@0 | 415 | ; r0 = ogg_int16_t *_y (destination) |
michael@0 | 416 | ; r1 = const ogg_int16_t *_x (source) |
michael@0 | 417 | LDRSH r9, [r1], #16 ; r9 = x[0] |
michael@0 | 418 | LDR r10,OC_C4S4 ; r10= OC_C4S4 |
michael@0 | 419 | LDRSH r12,[r1, #-12] ; r12= x[2] |
michael@0 | 420 | LDR r4, OC_C6S2 ; r4 = OC_C6S2 |
michael@0 | 421 | MUL r9, r10,r9 ; r9 = t[0]<<16 = OC_C4S4*x[0] |
michael@0 | 422 | LDR r5, OC_C2S6 ; r5 = OC_C2S6 |
michael@0 | 423 | MUL r4, r12,r4 ; r4 = t[2]<<16 = OC_C6S2*x[2] |
michael@0 | 424 | LDRSH r3, [r1, #-14] ; r3 = x[1] |
michael@0 | 425 | MUL r5, r12,r5 ; r5 = t[3]<<16 = OC_C2S6*x[2] |
michael@0 | 426 | LDR r6, OC_C7S1 ; r6 = OC_C7S1 |
michael@0 | 427 | LDR r12,OC_C1S7 ; r12= OC_C1S7 |
michael@0 | 428 | LDRSH r11,[r1, #-10] ; r11= x[3] |
michael@0 | 429 | MUL r6, r3, r6 ; r6 = t[4]<<16 = OC_C7S1*x[1] |
michael@0 | 430 | LDR r7, OC_C5S3 ; r7 = OC_C5S3 |
michael@0 | 431 | MUL r3, r12,r3 ; r3 = t[7]<<16 = OC_C1S7*x[1] |
michael@0 | 432 | LDR r8, OC_C3S5 ; r8 = OC_C3S5 |
michael@0 | 433 | MUL r7, r11,r7 ; r7 = -t[5]<<16 = OC_C5S3*x[3] |
michael@0 | 434 | MOV r9, r9, ASR #16 ; r9 = t[0] |
michael@0 | 435 | MUL r11,r8, r11 ; r11= t[6]<<16 = OC_C3S5*x[3] |
michael@0 | 436 | MOV r6, r6, ASR #16 ; r6 = t[4] |
michael@0 | 437 | ; TODO: This is wrong; t[4]-t[5] and t[7]-t[6] need to be truncated to 16-bit |
michael@0 | 438 | ; before multiplying, not after (this is not equivalent) |
michael@0 | 439 | SUB r7, r6, r7, ASR #16 ; r7 = t2[4]=t[4]+t[5] (as r7=-t[5]) |
michael@0 | 440 | RSB r6, r7, r6, LSL #1 ; r6 = t[4]-t[5] |
michael@0 | 441 | MUL r6, r10,r6 ; r6 = t2[5]<<16 =OC_C4S4*(t[4]-t[5]) |
michael@0 | 442 | MOV r3, r3, ASR #16 ; r3 = t[7] |
michael@0 | 443 | ADD r11,r3, r11,ASR #16 ; r11= t2[7]=t[7]+t[6] |
michael@0 | 444 | RSB r3, r11,r3, LSL #1 ; r3 = t[7]-t[6] |
michael@0 | 445 | MUL r3, r10,r3 ; r3 = t2[6]<<16 =OC_C4S4*(t[7]-t[6]) |
michael@0 | 446 | ADD r4, r9, r4, ASR #16 ; r4 = t[1] = t[0] + t[2] |
michael@0 | 447 | RSB r10,r4, r9, LSL #1 ; r10= t[2] = t[0] - t[2] |
michael@0 | 448 | ADD r5, r9, r5, ASR #16 ; r5 = t[0] = t[0] + t[3] |
michael@0 | 449 | RSB r9, r5, r9, LSL #1 ; r9 = t[3] = t[0] - t[3] |
michael@0 | 450 | MOV r3, r3, ASR #16 ; r3 = t2[6] |
michael@0 | 451 | ADD r6, r3, r6, ASR #16 ; r6 = t3[6] = t2[6]+t2[5] |
michael@0 | 452 | RSB r3, r6, r3, LSL #1 ; r3 = t3[5] = t2[6]-t2[5] |
michael@0 | 453 | ADD r11,r5, r11 ; r11= t[0]+t2[7] |
michael@0 | 454 | ADD r6, r4, r6 ; r6 = t[1]+t3[6] |
michael@0 | 455 | ADD r3, r10,r3 ; r3 = t[2]+t3[5] |
michael@0 | 456 | ADD r7, r9, r7 ; r7 = t[3]+t2[4] |
michael@0 | 457 | STRH r11,[r0], #2 ; y[0] = t[0]+t[7] |
michael@0 | 458 | STRH r6, [r0, #14] ; y[1] = t[1]+t2[6] |
michael@0 | 459 | STRH r3, [r0, #30] ; y[2] = t[2]+t2[5] |
michael@0 | 460 | STRH r7, [r0, #46] ; y[3] = t2[3]+t[4] |
michael@0 | 461 | RSB r11,r11,r5, LSL #1 ; r11= t[0]-t2[7] |
michael@0 | 462 | RSB r6, r6, r4, LSL #1 ; r6 = t[1]-t3[6] |
michael@0 | 463 | RSB r3, r3, r10,LSL #1 ; r3 = t[2]-t3[5] |
michael@0 | 464 | RSB r7, r7, r9, LSL #1 ; r7 = t[3]-t2[4] |
michael@0 | 465 | STRH r7, [r0, #62] ; y[4] = t2[3]-t[4] |
michael@0 | 466 | STRH r3, [r0, #78] ; y[5] = t[2]-t2[5] |
michael@0 | 467 | STRH r6, [r0, #94] ; y[6] = t[1]-t2[6] |
michael@0 | 468 | STRH r11, [r0, #110] ; y[7] = t2[0]-t[7] |
michael@0 | 469 | MOV PC,r14 |
michael@0 | 470 | ENDP |
michael@0 | 471 | |
michael@0 | 472 | idct4core_down_arm PROC |
michael@0 | 473 | ; r0 = ogg_int16_t *_y (destination) |
michael@0 | 474 | ; r1 = const ogg_int16_t *_x (source) |
michael@0 | 475 | LDRSH r9, [r1], #16 ; r9 = x[0] |
michael@0 | 476 | LDR r10,OC_C4S4 ; r10= OC_C4S4 |
michael@0 | 477 | LDRSH r12,[r1, #-12] ; r12= x[2] |
michael@0 | 478 | LDR r4, OC_C6S2 ; r4 = OC_C6S2 |
michael@0 | 479 | MUL r9, r10,r9 ; r9 = t[0]<<16 = OC_C4S4*x[0] |
michael@0 | 480 | LDR r5, OC_C2S6 ; r5 = OC_C2S6 |
michael@0 | 481 | MUL r4, r12,r4 ; r4 = t[2]<<16 = OC_C6S2*x[2] |
michael@0 | 482 | LDRSH r3, [r1, #-14] ; r3 = x[1] |
michael@0 | 483 | MUL r5, r12,r5 ; r5 = t[3]<<16 = OC_C2S6*x[2] |
michael@0 | 484 | LDR r6, OC_C7S1 ; r6 = OC_C7S1 |
michael@0 | 485 | LDR r12,OC_C1S7 ; r12= OC_C1S7 |
michael@0 | 486 | LDRSH r11,[r1, #-10] ; r11= x[3] |
michael@0 | 487 | MUL r6, r3, r6 ; r6 = t[4]<<16 = OC_C7S1*x[1] |
michael@0 | 488 | LDR r7, OC_C5S3 ; r7 = OC_C5S3 |
michael@0 | 489 | MUL r3, r12,r3 ; r3 = t[7]<<16 = OC_C1S7*x[1] |
michael@0 | 490 | LDR r8, OC_C3S5 ; r8 = OC_C3S5 |
michael@0 | 491 | MUL r7, r11,r7 ; r7 = -t[5]<<16 = OC_C5S3*x[3] |
michael@0 | 492 | MOV r9, r9, ASR #16 ; r9 = t[0] |
michael@0 | 493 | MUL r11,r8, r11 ; r11= t[6]<<16 = OC_C3S5*x[3] |
michael@0 | 494 | MOV r6, r6, ASR #16 ; r6 = t[4] |
michael@0 | 495 | ; TODO: This is wrong; t[4]-t[5] and t[7]-t[6] need to be truncated to 16-bit |
michael@0 | 496 | ; before multiplying, not after (this is not equivalent) |
michael@0 | 497 | SUB r7, r6, r7, ASR #16 ; r7 = t2[4]=t[4]+t[5] (as r7=-t[5]) |
michael@0 | 498 | RSB r6, r7, r6, LSL #1 ; r6 = t[4]-t[5] |
michael@0 | 499 | MUL r6, r10,r6 ; r6 = t2[5]<<16 =OC_C4S4*(t[4]-t[5]) |
michael@0 | 500 | MOV r3, r3, ASR #16 ; r3 = t[7] |
michael@0 | 501 | ADD r11,r3, r11,ASR #16 ; r11= t2[7]=t[7]+t[6] |
michael@0 | 502 | RSB r3, r11,r3, LSL #1 ; r3 = t[7]-t[6] |
michael@0 | 503 | ADD r9, r9, #8 ; r9 = t[0]+8 |
michael@0 | 504 | MUL r3, r10,r3 ; r3 = t2[6]<<16 =OC_C4S4*(t[7]-t[6]) |
michael@0 | 505 | ADD r4, r9, r4, ASR #16 ; r4 = t[1] = t[0] + t[2] + 8 |
michael@0 | 506 | RSB r10,r4, r9, LSL #1 ; r10= t[2] = t[0] - t[2] + 8 |
michael@0 | 507 | ADD r5, r9, r5, ASR #16 ; r5 = t[0] = t[0] + t[3] + 8 |
michael@0 | 508 | RSB r9, r5, r9, LSL #1 ; r9 = t[3] = t[0] - t[3] + 8 |
michael@0 | 509 | MOV r3, r3, ASR #16 ; r3 = t2[6] |
michael@0 | 510 | ADD r6, r3, r6, ASR #16 ; r6 = t3[6] = t2[6]+t2[5] |
michael@0 | 511 | RSB r3, r6, r3, LSL #1 ; r3 = t3[5] = t2[6]-t2[5] |
michael@0 | 512 | ADD r5, r5, r11 ; r5 = t[0]+t2[7]+8 |
michael@0 | 513 | ADD r4, r4, r6 ; r4 = t[1]+t3[6]+8 |
michael@0 | 514 | ADD r10,r10,r3 ; r10= t[2]+t3[5]+8 |
michael@0 | 515 | ADD r9, r9, r7 ; r9 = t[3]+t2[4]+8 |
michael@0 | 516 | SUB r11,r5, r11,LSL #1 ; r11= t[0]-t2[7]+8 |
michael@0 | 517 | SUB r6, r4, r6, LSL #1 ; r6 = t[1]-t3[6]+8 |
michael@0 | 518 | SUB r3, r10,r3, LSL #1 ; r3 = t[2]-t3[5]+8 |
michael@0 | 519 | SUB r7, r9, r7, LSL #1 ; r7 = t[3]-t2[4]+8 |
michael@0 | 520 | ; TODO: This is wrong. |
michael@0 | 521 | ; The C code truncates to 16 bits by storing to RAM and doing the |
michael@0 | 522 | ; shifts later; we've got an extra 4 bits here. |
michael@0 | 523 | MOV r11,r11,ASR #4 |
michael@0 | 524 | MOV r6, r6, ASR #4 |
michael@0 | 525 | MOV r3, r3, ASR #4 |
michael@0 | 526 | MOV r7, r7, ASR #4 |
michael@0 | 527 | MOV r9, r9, ASR #4 |
michael@0 | 528 | MOV r10,r10,ASR #4 |
michael@0 | 529 | MOV r4, r4, ASR #4 |
michael@0 | 530 | MOV r5, r5, ASR #4 |
michael@0 | 531 | STRH r5,[r0], #2 ; y[0] = t[0]+t[7] |
michael@0 | 532 | STRH r4, [r0, #14] ; y[1] = t[1]+t2[6] |
michael@0 | 533 | STRH r10,[r0, #30] ; y[2] = t[2]+t2[5] |
michael@0 | 534 | STRH r9, [r0, #46] ; y[3] = t2[3]+t[4] |
michael@0 | 535 | STRH r7, [r0, #62] ; y[4] = t2[3]-t[4] |
michael@0 | 536 | STRH r3, [r0, #78] ; y[5] = t[2]-t2[5] |
michael@0 | 537 | STRH r6, [r0, #94] ; y[6] = t[1]-t2[6] |
michael@0 | 538 | STRH r11,[r0, #110] ; y[7] = t2[0]-t[7] |
michael@0 | 539 | MOV PC,r14 |
michael@0 | 540 | ENDP |
michael@0 | 541 | |
michael@0 | 542 | idct8core_arm PROC |
michael@0 | 543 | ; r0 = ogg_int16_t *_y (destination) |
michael@0 | 544 | ; r1 = const ogg_int16_t *_x (source) |
michael@0 | 545 | LDRSH r2, [r1],#16 ; r2 = x[0] |
michael@0 | 546 | STMFD r13!,{r1,r14} |
michael@0 | 547 | LDRSH r6, [r1, #-8] ; r6 = x[4] |
michael@0 | 548 | LDR r12,OC_C4S4 ; r12= C4S4 |
michael@0 | 549 | LDRSH r4, [r1, #-12] ; r4 = x[2] |
michael@0 | 550 | ADD r2, r2, r6 ; r2 = x[0] + x[4] |
michael@0 | 551 | SUB r6, r2, r6, LSL #1 ; r6 = x[0] - x[4] |
michael@0 | 552 | ; For spec compliance, these sums must be truncated to 16-bit precision |
michael@0 | 553 | ; _before_ the multiply (not after). |
michael@0 | 554 | ; Sadly, ARMv4 provides no simple way to do that. |
michael@0 | 555 | MOV r2, r2, LSL #16 |
michael@0 | 556 | MOV r6, r6, LSL #16 |
michael@0 | 557 | MOV r2, r2, ASR #16 |
michael@0 | 558 | MOV r6, r6, ASR #16 |
michael@0 | 559 | MUL r2, r12,r2 ; r2 = t[0]<<16 = C4S4*(x[0]+x[4]) |
michael@0 | 560 | LDRSH r8, [r1, #-4] ; r8 = x[6] |
michael@0 | 561 | LDR r7, OC_C6S2 ; r7 = OC_C6S2 |
michael@0 | 562 | MUL r6, r12,r6 ; r6 = t[1]<<16 = C4S4*(x[0]-x[4]) |
michael@0 | 563 | LDR r14,OC_C2S6 ; r14= OC_C2S6 |
michael@0 | 564 | MUL r3, r4, r7 ; r3 = OC_C6S2*x[2] |
michael@0 | 565 | LDR r5, OC_C7S1 ; r5 = OC_C7S1 |
michael@0 | 566 | MUL r4, r14,r4 ; r4 = OC_C2S6*x[2] |
michael@0 | 567 | MOV r3, r3, ASR #16 ; r3 = OC_C6S2*x[2]>>16 |
michael@0 | 568 | MUL r14,r8, r14 ; r14= OC_C2S6*x[6] |
michael@0 | 569 | MOV r4, r4, ASR #16 ; r4 = OC_C2S6*x[2]>>16 |
michael@0 | 570 | MUL r8, r7, r8 ; r8 = OC_C6S2*x[6] |
michael@0 | 571 | LDR r7, OC_C1S7 ; r7 = OC_C1S7 |
michael@0 | 572 | SUB r3, r3, r14,ASR #16 ; r3=t[2]=C6S2*x[2]>>16-C2S6*x[6]>>16 |
michael@0 | 573 | LDRSH r14,[r1, #-14] ; r14= x[1] |
michael@0 | 574 | ADD r4, r4, r8, ASR #16 ; r4=t[3]=C2S6*x[2]>>16+C6S2*x[6]>>16 |
michael@0 | 575 | LDRSH r8, [r1, #-2] ; r8 = x[7] |
michael@0 | 576 | MUL r9, r5, r14 ; r9 = OC_C7S1*x[1] |
michael@0 | 577 | LDRSH r10,[r1, #-6] ; r10= x[5] |
michael@0 | 578 | MUL r14,r7, r14 ; r14= OC_C1S7*x[1] |
michael@0 | 579 | MOV r9, r9, ASR #16 ; r9 = OC_C7S1*x[1]>>16 |
michael@0 | 580 | MUL r7, r8, r7 ; r7 = OC_C1S7*x[7] |
michael@0 | 581 | MOV r14,r14,ASR #16 ; r14= OC_C1S7*x[1]>>16 |
michael@0 | 582 | MUL r8, r5, r8 ; r8 = OC_C7S1*x[7] |
michael@0 | 583 | LDRSH r1, [r1, #-10] ; r1 = x[3] |
michael@0 | 584 | LDR r5, OC_C3S5 ; r5 = OC_C3S5 |
michael@0 | 585 | LDR r11,OC_C5S3 ; r11= OC_C5S3 |
michael@0 | 586 | ADD r8, r14,r8, ASR #16 ; r8=t[7]=C1S7*x[1]>>16+C7S1*x[7]>>16 |
michael@0 | 587 | MUL r14,r5, r10 ; r14= OC_C3S5*x[5] |
michael@0 | 588 | SUB r9, r9, r7, ASR #16 ; r9=t[4]=C7S1*x[1]>>16-C1S7*x[7]>>16 |
michael@0 | 589 | MUL r10,r11,r10 ; r10= OC_C5S3*x[5] |
michael@0 | 590 | MOV r14,r14,ASR #16 ; r14= OC_C3S5*x[5]>>16 |
michael@0 | 591 | MUL r11,r1, r11 ; r11= OC_C5S3*x[3] |
michael@0 | 592 | MOV r10,r10,ASR #16 ; r10= OC_C5S3*x[5]>>16 |
michael@0 | 593 | MUL r1, r5, r1 ; r1 = OC_C3S5*x[3] |
michael@0 | 594 | SUB r14,r14,r11,ASR #16 ;r14=t[5]=C3S5*x[5]>>16-C5S3*x[3]>>16 |
michael@0 | 595 | ADD r10,r10,r1, ASR #16 ;r10=t[6]=C5S3*x[5]>>16+C3S5*x[3]>>16 |
michael@0 | 596 | ; r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t[7] r9=t[4] |
michael@0 | 597 | ; r10=t[6] r12=C4S4 r14=t[5] |
michael@0 | 598 | ; TODO: This is wrong; t[4]-t[5] and t[7]-t[6] need to be truncated to 16-bit |
michael@0 | 599 | ; before multiplying, not after (this is not equivalent) |
michael@0 | 600 | ; Stage 2 |
michael@0 | 601 | ; 4-5 butterfly |
michael@0 | 602 | ADD r9, r9, r14 ; r9 = t2[4] = t[4]+t[5] |
michael@0 | 603 | SUB r14,r9, r14, LSL #1 ; r14= t[4]-t[5] |
michael@0 | 604 | MUL r14,r12,r14 ; r14= t2[5]<<16 = C4S4*(t[4]-t[5]) |
michael@0 | 605 | ; 7-6 butterfly |
michael@0 | 606 | ADD r8, r8, r10 ; r8 = t2[7] = t[7]+t[6] |
michael@0 | 607 | SUB r10,r8, r10, LSL #1 ; r10= t[7]-t[6] |
michael@0 | 608 | MUL r10,r12,r10 ; r10= t2[6]<<16 = C4S4*(t[7]+t[6]) |
michael@0 | 609 | ; r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t2[7] r9=t2[4] |
michael@0 | 610 | ; r10=t2[6]<<16 r12=C4S4 r14=t2[5]<<16 |
michael@0 | 611 | ; Stage 3 |
michael@0 | 612 | ; 0-3 butterfly |
michael@0 | 613 | ADD r2, r4, r2, ASR #16 ; r2 = t2[0] = t[0] + t[3] |
michael@0 | 614 | SUB r4, r2, r4, LSL #1 ; r4 = t2[3] = t[0] - t[3] |
michael@0 | 615 | ; 1-2 butterfly |
michael@0 | 616 | ADD r6, r3, r6, ASR #16 ; r6 = t2[1] = t[1] + t[2] |
michael@0 | 617 | SUB r3, r6, r3, LSL #1 ; r3 = t2[2] = t[1] - t[2] |
michael@0 | 618 | ; 6-5 butterfly |
michael@0 | 619 | MOV r14,r14,ASR #16 ; r14= t2[5] |
michael@0 | 620 | ADD r10,r14,r10,ASR #16 ; r10= t3[6] = t[6] + t[5] |
michael@0 | 621 | SUB r14,r10,r14,LSL #1 ; r14= t3[5] = t[6] - t[5] |
michael@0 | 622 | ; r2=t2[0] r3=t2[2] r4=t2[3] r6=t2[1] r8=t2[7] r9=t2[4] |
michael@0 | 623 | ; r10=t3[6] r14=t3[5] |
michael@0 | 624 | ; Stage 4 |
michael@0 | 625 | ADD r2, r2, r8 ; r2 = t[0] + t[7] |
michael@0 | 626 | ADD r6, r6, r10 ; r6 = t[1] + t[6] |
michael@0 | 627 | ADD r3, r3, r14 ; r3 = t[2] + t[5] |
michael@0 | 628 | ADD r4, r4, r9 ; r4 = t[3] + t[4] |
michael@0 | 629 | SUB r8, r2, r8, LSL #1 ; r8 = t[0] - t[7] |
michael@0 | 630 | SUB r10,r6, r10,LSL #1 ; r10= t[1] - t[6] |
michael@0 | 631 | SUB r14,r3, r14,LSL #1 ; r14= t[2] - t[5] |
michael@0 | 632 | SUB r9, r4, r9, LSL #1 ; r9 = t[3] - t[4] |
michael@0 | 633 | STRH r2, [r0], #2 ; y[0] = t[0]+t[7] |
michael@0 | 634 | STRH r6, [r0, #14] ; y[1] = t[1]+t[6] |
michael@0 | 635 | STRH r3, [r0, #30] ; y[2] = t[2]+t[5] |
michael@0 | 636 | STRH r4, [r0, #46] ; y[3] = t[3]+t[4] |
michael@0 | 637 | STRH r9, [r0, #62] ; y[4] = t[3]-t[4] |
michael@0 | 638 | STRH r14,[r0, #78] ; y[5] = t[2]-t[5] |
michael@0 | 639 | STRH r10,[r0, #94] ; y[6] = t[1]-t[6] |
michael@0 | 640 | STRH r8, [r0, #110] ; y[7] = t[0]-t[7] |
michael@0 | 641 | LDMFD r13!,{r1,PC} |
michael@0 | 642 | ENDP |
michael@0 | 643 | |
michael@0 | 644 | idct8core_down_arm PROC |
michael@0 | 645 | ; r0 = ogg_int16_t *_y (destination) |
michael@0 | 646 | ; r1 = const ogg_int16_t *_x (source) |
michael@0 | 647 | LDRSH r2, [r1],#16 ; r2 = x[0] |
michael@0 | 648 | STMFD r13!,{r1,r14} |
michael@0 | 649 | LDRSH r6, [r1, #-8] ; r6 = x[4] |
michael@0 | 650 | LDR r12,OC_C4S4 ; r12= C4S4 |
michael@0 | 651 | LDRSH r4, [r1, #-12] ; r4 = x[2] |
michael@0 | 652 | ADD r2, r2, r6 ; r2 = x[0] + x[4] |
michael@0 | 653 | SUB r6, r2, r6, LSL #1 ; r6 = x[0] - x[4] |
michael@0 | 654 | ; For spec compliance, these sums must be truncated to 16-bit precision |
michael@0 | 655 | ; _before_ the multiply (not after). |
michael@0 | 656 | ; Sadly, ARMv4 provides no simple way to do that. |
michael@0 | 657 | MOV r2, r2, LSL #16 |
michael@0 | 658 | MOV r6, r6, LSL #16 |
michael@0 | 659 | MOV r2, r2, ASR #16 |
michael@0 | 660 | MOV r6, r6, ASR #16 |
michael@0 | 661 | MUL r2, r12,r2 ; r2 = t[0]<<16 = C4S4*(x[0]+x[4]) |
michael@0 | 662 | LDRSH r8, [r1, #-4] ; r8 = x[6] |
michael@0 | 663 | LDR r7, OC_C6S2 ; r7 = OC_C6S2 |
michael@0 | 664 | MUL r6, r12,r6 ; r6 = t[1]<<16 = C4S4*(x[0]-x[4]) |
michael@0 | 665 | LDR r14,OC_C2S6 ; r14= OC_C2S6 |
michael@0 | 666 | MUL r3, r4, r7 ; r3 = OC_C6S2*x[2] |
michael@0 | 667 | LDR r5, OC_C7S1 ; r5 = OC_C7S1 |
michael@0 | 668 | MUL r4, r14,r4 ; r4 = OC_C2S6*x[2] |
michael@0 | 669 | MOV r3, r3, ASR #16 ; r3 = OC_C6S2*x[2]>>16 |
michael@0 | 670 | MUL r14,r8, r14 ; r14= OC_C2S6*x[6] |
michael@0 | 671 | MOV r4, r4, ASR #16 ; r4 = OC_C2S6*x[2]>>16 |
michael@0 | 672 | MUL r8, r7, r8 ; r8 = OC_C6S2*x[6] |
michael@0 | 673 | LDR r7, OC_C1S7 ; r7 = OC_C1S7 |
michael@0 | 674 | SUB r3, r3, r14,ASR #16 ; r3=t[2]=C6S2*x[2]>>16-C2S6*x[6]>>16 |
michael@0 | 675 | LDRSH r14,[r1, #-14] ; r14= x[1] |
michael@0 | 676 | ADD r4, r4, r8, ASR #16 ; r4=t[3]=C2S6*x[2]>>16+C6S2*x[6]>>16 |
michael@0 | 677 | LDRSH r8, [r1, #-2] ; r8 = x[7] |
michael@0 | 678 | MUL r9, r5, r14 ; r9 = OC_C7S1*x[1] |
michael@0 | 679 | LDRSH r10,[r1, #-6] ; r10= x[5] |
michael@0 | 680 | MUL r14,r7, r14 ; r14= OC_C1S7*x[1] |
michael@0 | 681 | MOV r9, r9, ASR #16 ; r9 = OC_C7S1*x[1]>>16 |
michael@0 | 682 | MUL r7, r8, r7 ; r7 = OC_C1S7*x[7] |
michael@0 | 683 | MOV r14,r14,ASR #16 ; r14= OC_C1S7*x[1]>>16 |
michael@0 | 684 | MUL r8, r5, r8 ; r8 = OC_C7S1*x[7] |
michael@0 | 685 | LDRSH r1, [r1, #-10] ; r1 = x[3] |
michael@0 | 686 | LDR r5, OC_C3S5 ; r5 = OC_C3S5 |
michael@0 | 687 | LDR r11,OC_C5S3 ; r11= OC_C5S3 |
michael@0 | 688 | ADD r8, r14,r8, ASR #16 ; r8=t[7]=C1S7*x[1]>>16+C7S1*x[7]>>16 |
michael@0 | 689 | MUL r14,r5, r10 ; r14= OC_C3S5*x[5] |
michael@0 | 690 | SUB r9, r9, r7, ASR #16 ; r9=t[4]=C7S1*x[1]>>16-C1S7*x[7]>>16 |
michael@0 | 691 | MUL r10,r11,r10 ; r10= OC_C5S3*x[5] |
michael@0 | 692 | MOV r14,r14,ASR #16 ; r14= OC_C3S5*x[5]>>16 |
michael@0 | 693 | MUL r11,r1, r11 ; r11= OC_C5S3*x[3] |
michael@0 | 694 | MOV r10,r10,ASR #16 ; r10= OC_C5S3*x[5]>>16 |
michael@0 | 695 | MUL r1, r5, r1 ; r1 = OC_C3S5*x[3] |
michael@0 | 696 | SUB r14,r14,r11,ASR #16 ;r14=t[5]=C3S5*x[5]>>16-C5S3*x[3]>>16 |
michael@0 | 697 | ADD r10,r10,r1, ASR #16 ;r10=t[6]=C5S3*x[5]>>16+C3S5*x[3]>>16 |
michael@0 | 698 | ; r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t[7] r9=t[4] |
michael@0 | 699 | ; r10=t[6] r12=C4S4 r14=t[5] |
michael@0 | 700 | ; Stage 2 |
michael@0 | 701 | ; TODO: This is wrong; t[4]-t[5] and t[7]-t[6] need to be truncated to 16-bit |
michael@0 | 702 | ; before multiplying, not after (this is not equivalent) |
michael@0 | 703 | ; 4-5 butterfly |
michael@0 | 704 | ADD r9, r9, r14 ; r9 = t2[4] = t[4]+t[5] |
michael@0 | 705 | SUB r14,r9, r14, LSL #1 ; r14= t[4]-t[5] |
michael@0 | 706 | MUL r14,r12,r14 ; r14= t2[5]<<16 = C4S4*(t[4]-t[5]) |
michael@0 | 707 | ; 7-6 butterfly |
michael@0 | 708 | ADD r8, r8, r10 ; r8 = t2[7] = t[7]+t[6] |
michael@0 | 709 | SUB r10,r8, r10, LSL #1 ; r10= t[7]-t[6] |
michael@0 | 710 | MUL r10,r12,r10 ; r10= t2[6]<<16 = C4S4*(t[7]+t[6]) |
michael@0 | 711 | ; r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t2[7] r9=t2[4] |
michael@0 | 712 | ; r10=t2[6]<<16 r12=C4S4 r14=t2[5]<<16 |
michael@0 | 713 | ; Stage 3 |
michael@0 | 714 | ADD r2, r2, #8<<16 ; r2 = t[0]+8<<16 |
michael@0 | 715 | ADD r6, r6, #8<<16 ; r6 = t[1]+8<<16 |
michael@0 | 716 | ; 0-3 butterfly |
michael@0 | 717 | ADD r2, r4, r2, ASR #16 ; r2 = t2[0] = t[0] + t[3] + 8 |
michael@0 | 718 | SUB r4, r2, r4, LSL #1 ; r4 = t2[3] = t[0] - t[3] + 8 |
michael@0 | 719 | ; 1-2 butterfly |
michael@0 | 720 | ADD r6, r3, r6, ASR #16 ; r6 = t2[1] = t[1] + t[2] + 8 |
michael@0 | 721 | SUB r3, r6, r3, LSL #1 ; r3 = t2[2] = t[1] - t[2] + 8 |
michael@0 | 722 | ; 6-5 butterfly |
michael@0 | 723 | MOV r14,r14,ASR #16 ; r14= t2[5] |
michael@0 | 724 | ADD r10,r14,r10,ASR #16 ; r10= t3[6] = t[6] + t[5] |
michael@0 | 725 | SUB r14,r10,r14,LSL #1 ; r14= t3[5] = t[6] - t[5] |
michael@0 | 726 | ; r2=t2[0] r3=t2[2] r4=t2[3] r6=t2[1] r8=t2[7] r9=t2[4] |
michael@0 | 727 | ; r10=t3[6] r14=t3[5] |
michael@0 | 728 | ; Stage 4 |
michael@0 | 729 | ADD r2, r2, r8 ; r2 = t[0] + t[7] + 8 |
michael@0 | 730 | ADD r6, r6, r10 ; r6 = t[1] + t[6] + 8 |
michael@0 | 731 | ADD r3, r3, r14 ; r3 = t[2] + t[5] + 8 |
michael@0 | 732 | ADD r4, r4, r9 ; r4 = t[3] + t[4] + 8 |
michael@0 | 733 | SUB r8, r2, r8, LSL #1 ; r8 = t[0] - t[7] + 8 |
michael@0 | 734 | SUB r10,r6, r10,LSL #1 ; r10= t[1] - t[6] + 8 |
michael@0 | 735 | SUB r14,r3, r14,LSL #1 ; r14= t[2] - t[5] + 8 |
michael@0 | 736 | SUB r9, r4, r9, LSL #1 ; r9 = t[3] - t[4] + 8 |
michael@0 | 737 | ; TODO: This is wrong. |
michael@0 | 738 | ; The C code truncates to 16 bits by storing to RAM and doing the |
michael@0 | 739 | ; shifts later; we've got an extra 4 bits here. |
michael@0 | 740 | MOV r2, r2, ASR #4 |
michael@0 | 741 | MOV r6, r6, ASR #4 |
michael@0 | 742 | MOV r3, r3, ASR #4 |
michael@0 | 743 | MOV r4, r4, ASR #4 |
michael@0 | 744 | MOV r8, r8, ASR #4 |
michael@0 | 745 | MOV r10,r10,ASR #4 |
michael@0 | 746 | MOV r14,r14,ASR #4 |
michael@0 | 747 | MOV r9, r9, ASR #4 |
michael@0 | 748 | STRH r2, [r0], #2 ; y[0] = t[0]+t[7] |
michael@0 | 749 | STRH r6, [r0, #14] ; y[1] = t[1]+t[6] |
michael@0 | 750 | STRH r3, [r0, #30] ; y[2] = t[2]+t[5] |
michael@0 | 751 | STRH r4, [r0, #46] ; y[3] = t[3]+t[4] |
michael@0 | 752 | STRH r9, [r0, #62] ; y[4] = t[3]-t[4] |
michael@0 | 753 | STRH r14,[r0, #78] ; y[5] = t[2]-t[5] |
michael@0 | 754 | STRH r10,[r0, #94] ; y[6] = t[1]-t[6] |
michael@0 | 755 | STRH r8, [r0, #110] ; y[7] = t[0]-t[7] |
michael@0 | 756 | LDMFD r13!,{r1,PC} |
michael@0 | 757 | ENDP |
michael@0 | 758 | |
michael@0 | 759 | [ OC_ARM_ASM_MEDIA |
michael@0 | 760 | EXPORT oc_idct8x8_1_v6 |
michael@0 | 761 | EXPORT oc_idct8x8_v6 |
michael@0 | 762 | |
michael@0 | 763 | oc_idct8x8_1_v6 PROC |
michael@0 | 764 | ; r0 = ogg_int16_t *_y |
michael@0 | 765 | ; r1 = ogg_uint16_t _dc |
michael@0 | 766 | ORR r2, r1, r1, LSL #16 |
michael@0 | 767 | ORR r3, r1, r1, LSL #16 |
michael@0 | 768 | STRD r2, [r0], #8 |
michael@0 | 769 | STRD r2, [r0], #8 |
michael@0 | 770 | STRD r2, [r0], #8 |
michael@0 | 771 | STRD r2, [r0], #8 |
michael@0 | 772 | STRD r2, [r0], #8 |
michael@0 | 773 | STRD r2, [r0], #8 |
michael@0 | 774 | STRD r2, [r0], #8 |
michael@0 | 775 | STRD r2, [r0], #8 |
michael@0 | 776 | STRD r2, [r0], #8 |
michael@0 | 777 | STRD r2, [r0], #8 |
michael@0 | 778 | STRD r2, [r0], #8 |
michael@0 | 779 | STRD r2, [r0], #8 |
michael@0 | 780 | STRD r2, [r0], #8 |
michael@0 | 781 | STRD r2, [r0], #8 |
michael@0 | 782 | STRD r2, [r0], #8 |
michael@0 | 783 | STRD r2, [r0], #8 |
michael@0 | 784 | MOV PC, r14 |
michael@0 | 785 | ENDP |
michael@0 | 786 | |
michael@0 | 787 | oc_idct8x8_v6 PROC |
michael@0 | 788 | ; r0 = ogg_int16_t *_y |
michael@0 | 789 | ; r1 = ogg_int16_t *_x |
michael@0 | 790 | ; r2 = int _last_zzi |
michael@0 | 791 | CMP r2, #3 |
michael@0 | 792 | BLE oc_idct8x8_3_v6 |
michael@0 | 793 | ;CMP r2, #6 |
michael@0 | 794 | ;BLE oc_idct8x8_6_v6 |
michael@0 | 795 | CMP r2, #10 |
michael@0 | 796 | BLE oc_idct8x8_10_v6 |
michael@0 | 797 | oc_idct8x8_slow_v6 |
michael@0 | 798 | STMFD r13!,{r4-r11,r14} |
michael@0 | 799 | SUB r13,r13,#64*2 |
michael@0 | 800 | ; Row transforms |
michael@0 | 801 | STR r0, [r13,#-4]! |
michael@0 | 802 | ADD r0, r13, #4 ; Write to temp storage. |
michael@0 | 803 | BL idct8_8core_v6 |
michael@0 | 804 | BL idct8_8core_v6 |
michael@0 | 805 | BL idct8_8core_v6 |
michael@0 | 806 | BL idct8_8core_v6 |
michael@0 | 807 | LDR r0, [r13], #4 ; Write to the final destination. |
michael@0 | 808 | ; Clear input data for next block (decoder only). |
michael@0 | 809 | SUB r2, r1, #8*16 |
michael@0 | 810 | CMP r0, r2 |
michael@0 | 811 | MOV r1, r13 ; And read from temp storage. |
michael@0 | 812 | BEQ oc_idct8x8_slow_v6_cols |
michael@0 | 813 | MOV r4, #0 |
michael@0 | 814 | MOV r5, #0 |
michael@0 | 815 | STRD r4, [r2], #8 |
michael@0 | 816 | STRD r4, [r2], #8 |
michael@0 | 817 | STRD r4, [r2], #8 |
michael@0 | 818 | STRD r4, [r2], #8 |
michael@0 | 819 | STRD r4, [r2], #8 |
michael@0 | 820 | STRD r4, [r2], #8 |
michael@0 | 821 | STRD r4, [r2], #8 |
michael@0 | 822 | STRD r4, [r2], #8 |
michael@0 | 823 | STRD r4, [r2], #8 |
michael@0 | 824 | STRD r4, [r2], #8 |
michael@0 | 825 | STRD r4, [r2], #8 |
michael@0 | 826 | STRD r4, [r2], #8 |
michael@0 | 827 | STRD r4, [r2], #8 |
michael@0 | 828 | STRD r4, [r2], #8 |
michael@0 | 829 | STRD r4, [r2], #8 |
michael@0 | 830 | STRD r4, [r2], #8 |
michael@0 | 831 | oc_idct8x8_slow_v6_cols |
michael@0 | 832 | ; Column transforms |
michael@0 | 833 | BL idct8_8core_down_v6 |
michael@0 | 834 | BL idct8_8core_down_v6 |
michael@0 | 835 | BL idct8_8core_down_v6 |
michael@0 | 836 | BL idct8_8core_down_v6 |
michael@0 | 837 | ADD r13,r13,#64*2 |
michael@0 | 838 | LDMFD r13!,{r4-r11,PC} |
michael@0 | 839 | ENDP |
michael@0 | 840 | |
michael@0 | 841 | oc_idct8x8_10_v6 PROC |
michael@0 | 842 | STMFD r13!,{r4-r11,r14} |
michael@0 | 843 | SUB r13,r13,#64*2+4 |
michael@0 | 844 | ; Row transforms |
michael@0 | 845 | MOV r2, r13 |
michael@0 | 846 | STR r0, [r13,#-4]! |
michael@0 | 847 | AND r0, r2, #4 ; Align the stack. |
michael@0 | 848 | ADD r0, r0, r2 ; Write to temp storage. |
michael@0 | 849 | BL idct4_3core_v6 |
michael@0 | 850 | BL idct2_1core_v6 |
michael@0 | 851 | LDR r0, [r13], #4 ; Write to the final destination. |
michael@0 | 852 | ; Clear input data for next block (decoder only). |
michael@0 | 853 | SUB r2, r1, #4*16 |
michael@0 | 854 | CMP r0, r2 |
michael@0 | 855 | AND r1, r13,#4 ; Align the stack. |
michael@0 | 856 | BEQ oc_idct8x8_10_v6_cols |
michael@0 | 857 | MOV r4, #0 |
michael@0 | 858 | MOV r5, #0 |
michael@0 | 859 | STRD r4, [r2] |
michael@0 | 860 | STRD r4, [r2,#16] |
michael@0 | 861 | STR r4, [r2,#32] |
michael@0 | 862 | STR r4, [r2,#48] |
michael@0 | 863 | oc_idct8x8_10_v6_cols |
michael@0 | 864 | ; Column transforms |
michael@0 | 865 | ADD r1, r1, r13 ; And read from temp storage. |
michael@0 | 866 | BL idct4_4core_down_v6 |
michael@0 | 867 | BL idct4_4core_down_v6 |
michael@0 | 868 | BL idct4_4core_down_v6 |
michael@0 | 869 | BL idct4_4core_down_v6 |
michael@0 | 870 | ADD r13,r13,#64*2+4 |
michael@0 | 871 | LDMFD r13!,{r4-r11,PC} |
michael@0 | 872 | ENDP |
michael@0 | 873 | |
michael@0 | 874 | oc_idct8x8_3_v6 PROC |
michael@0 | 875 | STMFD r13!,{r4-r8,r14} |
michael@0 | 876 | SUB r13,r13,#64*2 |
michael@0 | 877 | ; Row transforms |
michael@0 | 878 | MOV r8, r0 |
michael@0 | 879 | MOV r0, r13 ; Write to temp storage. |
michael@0 | 880 | BL idct2_1core_v6 |
michael@0 | 881 | ; Clear input data for next block (decoder only). |
michael@0 | 882 | SUB r0, r1, #2*16 |
michael@0 | 883 | CMP r0, r8 |
michael@0 | 884 | MOV r1, r13 ; Read from temp storage. |
michael@0 | 885 | MOVNE r4, #0 |
michael@0 | 886 | STRNE r4, [r0] |
michael@0 | 887 | STRNE r4, [r0,#16] |
michael@0 | 888 | MOVNE r0, r8 ; Write to the final destination. |
michael@0 | 889 | ; Column transforms |
michael@0 | 890 | BL idct2_2core_down_v6 |
michael@0 | 891 | BL idct2_2core_down_v6 |
michael@0 | 892 | BL idct2_2core_down_v6 |
michael@0 | 893 | BL idct2_2core_down_v6 |
michael@0 | 894 | ADD r13,r13,#64*2 |
michael@0 | 895 | LDMFD r13!,{r4-r8,PC} |
michael@0 | 896 | ENDP |
michael@0 | 897 | |
michael@0 | 898 | idct2_1core_v6 PROC |
michael@0 | 899 | ; r0 = ogg_int16_t *_y (destination) |
michael@0 | 900 | ; r1 = const ogg_int16_t *_x (source) |
michael@0 | 901 | ; Stage 1: |
michael@0 | 902 | LDR r2, [r1], #16 ; r2 = <x[0,1]|x[0,0]> |
michael@0 | 903 | LDR r3, OC_C4S4 |
michael@0 | 904 | LDRSH r6, [r1], #16 ; r6 = x[1,0] |
michael@0 | 905 | SMULWB r12,r3, r2 ; r12= t[0,0]=OC_C4S4*x[0,0]>>16 |
michael@0 | 906 | LDRD r4, OC_C7S1 ; r4 = OC_C7S1; r5 = OC_C1S7 |
michael@0 | 907 | SMULWB r6, r3, r6 ; r6 = t[1,0]=OC_C4S4*x[1,0]>>16 |
michael@0 | 908 | SMULWT r4, r4, r2 ; r4 = t[0,4]=OC_C7S1*x[0,1]>>16 |
michael@0 | 909 | SMULWT r7, r5, r2 ; r7 = t[0,7]=OC_C1S7*x[0,1]>>16 |
michael@0 | 910 | ; Stage 2: |
michael@0 | 911 | SMULWB r5, r3, r4 ; r5 = t[0,5]=OC_C4S4*t[0,4]>>16 |
michael@0 | 912 | PKHBT r12,r12,r6, LSL #16 ; r12= <t[1,0]|t[0,0]> |
michael@0 | 913 | SMULWB r6, r3, r7 ; r6 = t[0,6]=OC_C4S4*t[0,7]>>16 |
michael@0 | 914 | PKHBT r7, r7, r3 ; r7 = <0|t[0,7]> |
michael@0 | 915 | ; Stage 3: |
michael@0 | 916 | PKHBT r5, r6, r5, LSL #16 ; r5 = <t[0,5]|t[0,6]> |
michael@0 | 917 | PKHBT r4, r4, r3 ; r4 = <0|t[0,4]> |
michael@0 | 918 | SASX r5, r5, r5 ; r5 = <t[0,6]+t[0,5]|t[0,6]-t[0,5]> |
michael@0 | 919 | ; Stage 4: |
michael@0 | 920 | PKHTB r6, r3, r5, ASR #16 ; r6 = <0|t[0,6]> |
michael@0 | 921 | PKHBT r5, r5, r3 ; r5 = <0|t[0,5]> |
michael@0 | 922 | SADD16 r3, r12,r7 ; r3 = t[0]+t[7] |
michael@0 | 923 | STR r3, [r0], #4 ; y[0<<3] = t[0]+t[7] |
michael@0 | 924 | SADD16 r3, r12,r6 ; r3 = t[0]+t[6] |
michael@0 | 925 | STR r3, [r0, #12] ; y[1<<3] = t[0]+t[6] |
michael@0 | 926 | SADD16 r3, r12,r5 ; r3 = t[0]+t[5] |
michael@0 | 927 | STR r3, [r0, #28] ; y[2<<3] = t[0]+t[5] |
michael@0 | 928 | SADD16 r3, r12,r4 ; r3 = t[0]+t[4] |
michael@0 | 929 | STR r3, [r0, #44] ; y[3<<3] = t[0]+t[4] |
michael@0 | 930 | SSUB16 r4, r12,r4 ; r4 = t[0]-t[4] |
michael@0 | 931 | STR r4, [r0, #60] ; y[4<<3] = t[0]-t[4] |
michael@0 | 932 | SSUB16 r5, r12,r5 ; r5 = t[0]-t[5] |
michael@0 | 933 | STR r5, [r0, #76] ; y[5<<3] = t[0]-t[5] |
michael@0 | 934 | SSUB16 r6, r12,r6 ; r6 = t[0]-t[6] |
michael@0 | 935 | STR r6, [r0, #92] ; y[6<<3] = t[0]-t[6] |
michael@0 | 936 | SSUB16 r7, r12,r7 ; r7 = t[0]-t[7] |
michael@0 | 937 | STR r7, [r0, #108] ; y[7<<3] = t[0]-t[7] |
michael@0 | 938 | MOV PC,r14 |
michael@0 | 939 | ENDP |
michael@0 | 940 | ] |
michael@0 | 941 | |
michael@0 | 942 | ALIGN 8 |
michael@0 | 943 | OC_C7S1 |
michael@0 | 944 | DCD 12785 ; 31F1 |
michael@0 | 945 | OC_C1S7 |
michael@0 | 946 | DCD 64277 ; FB15 |
michael@0 | 947 | OC_C6S2 |
michael@0 | 948 | DCD 25080 ; 61F8 |
michael@0 | 949 | OC_C2S6 |
michael@0 | 950 | DCD 60547 ; EC83 |
michael@0 | 951 | OC_C5S3 |
michael@0 | 952 | DCD 36410 ; 8E3A |
michael@0 | 953 | OC_C3S5 |
michael@0 | 954 | DCD 54491 ; D4DB |
michael@0 | 955 | OC_C4S4 |
michael@0 | 956 | DCD 46341 ; B505 |
michael@0 | 957 | |
michael@0 | 958 | [ OC_ARM_ASM_MEDIA |
michael@0 | 959 | idct2_2core_down_v6 PROC |
michael@0 | 960 | ; r0 = ogg_int16_t *_y (destination) |
michael@0 | 961 | ; r1 = const ogg_int16_t *_x (source) |
michael@0 | 962 | ; Stage 1: |
michael@0 | 963 | LDR r2, [r1], #16 ; r2 = <x[0,1]|x[0,0]> |
michael@0 | 964 | LDR r3, OC_C4S4 |
michael@0 | 965 | MOV r7 ,#8 ; r7 = 8 |
michael@0 | 966 | LDR r6, [r1], #16 ; r6 = <x[1,1]|x[1,0]> |
michael@0 | 967 | SMLAWB r12,r3, r2, r7 ; r12= (t[0,0]=OC_C4S4*x[0,0]>>16)+8 |
michael@0 | 968 | LDRD r4, OC_C7S1 ; r4 = OC_C7S1; r5 = OC_C1S7 |
michael@0 | 969 | SMLAWB r7, r3, r6, r7 ; r7 = (t[1,0]=OC_C4S4*x[1,0]>>16)+8 |
michael@0 | 970 | SMULWT r5, r5, r2 ; r2 = t[0,7]=OC_C1S7*x[0,1]>>16 |
michael@0 | 971 | PKHBT r12,r12,r7, LSL #16 ; r12= <t[1,0]+8|t[0,0]+8> |
michael@0 | 972 | SMULWT r4, r4, r2 ; r4 = t[0,4]=OC_C7S1*x[0,1]>>16 |
michael@0 | 973 | ; Here we cheat: row 1 had just a DC, so x[0,1]==x[1,1] by definition. |
michael@0 | 974 | PKHBT r7, r5, r5, LSL #16 ; r7 = <t[0,7]|t[0,7]> |
michael@0 | 975 | ; Stage 2: |
michael@0 | 976 | SMULWB r6, r3, r7 ; r6 = t[0,6]=OC_C4S4*t[0,7]>>16 |
michael@0 | 977 | PKHBT r4, r4, r4, LSL #16 ; r4 = <t[0,4]|t[0,4]> |
michael@0 | 978 | SMULWT r2, r3, r7 ; r2 = t[1,6]=OC_C4S4*t[1,7]>>16 |
michael@0 | 979 | SMULWB r5, r3, r4 ; r5 = t[0,5]=OC_C4S4*t[0,4]>>16 |
michael@0 | 980 | PKHBT r6, r6, r2, LSL #16 ; r6 = <t[1,6]|t[0,6]> |
michael@0 | 981 | SMULWT r2, r3, r4 ; r2 = t[1,5]=OC_C4S4*t[1,4]>>16 |
michael@0 | 982 | PKHBT r2, r5, r2, LSL #16 ; r2 = <t[1,5]|t[0,5]> |
michael@0 | 983 | ; Stage 3: |
michael@0 | 984 | SSUB16 r5, r6, r2 ; r5 = <t[1,6]-t[1,5]|t[0,6]-t[0,5]> |
michael@0 | 985 | SADD16 r6, r6, r2 ; r6 = <t[1,6]+t[1,5]|t[0,6]+t[0,5]> |
michael@0 | 986 | ; Stage 4: |
michael@0 | 987 | SADD16 r2, r12,r7 ; r2 = t[0]+t[7]+8 |
michael@0 | 988 | MOV r3, r2, ASR #4 |
michael@0 | 989 | MOV r2, r2, LSL #16 |
michael@0 | 990 | PKHTB r3, r3, r2, ASR #20 ; r3 = t[0]+t[7]+8>>4 |
michael@0 | 991 | STR r3, [r0], #4 ; y[0<<3] = t[0]+t[7]+8>>4 |
michael@0 | 992 | SADD16 r2, r12,r6 ; r2 = t[0]+t[6]+8 |
michael@0 | 993 | MOV r3, r2, ASR #4 |
michael@0 | 994 | MOV r2, r2, LSL #16 |
michael@0 | 995 | PKHTB r3, r3, r2, ASR #20 ; r3 = t[0]+t[6]+8>>4 |
michael@0 | 996 | STR r3, [r0, #12] ; y[1<<3] = t[0]+t[6]+8>>4 |
michael@0 | 997 | SADD16 r2, r12,r5 ; r2 = t[0]+t[5]+8 |
michael@0 | 998 | MOV r3, r2, ASR #4 |
michael@0 | 999 | MOV r2, r2, LSL #16 |
michael@0 | 1000 | PKHTB r3, r3, r2, ASR #20 ; r3 = t[0]+t[5]+8>>4 |
michael@0 | 1001 | STR r3, [r0, #28] ; y[2<<3] = t[0]+t[5]+8>>4 |
michael@0 | 1002 | SADD16 r2, r12,r4 ; r2 = t[0]+t[4]+8 |
michael@0 | 1003 | MOV r3, r2, ASR #4 |
michael@0 | 1004 | MOV r2, r2, LSL #16 |
michael@0 | 1005 | PKHTB r3, r3, r2, ASR #20 ; r3 = t[0]+t[4]+8>>4 |
michael@0 | 1006 | STR r3, [r0, #44] ; y[3<<3] = t[0]+t[4]+8>>4 |
michael@0 | 1007 | SSUB16 r4, r12,r4 ; r4 = t[0]-t[4]+8 |
michael@0 | 1008 | MOV r3, r4, ASR #4 |
michael@0 | 1009 | MOV r4, r4, LSL #16 |
michael@0 | 1010 | PKHTB r3, r3, r4, ASR #20 ; r3 = t[0]-t[4]+8>>4 |
michael@0 | 1011 | STR r3, [r0, #60] ; y[4<<3] = t[0]-t[4]+8>>4 |
michael@0 | 1012 | SSUB16 r5, r12,r5 ; r5 = t[0]-t[5]+8 |
michael@0 | 1013 | MOV r3, r5, ASR #4 |
michael@0 | 1014 | MOV r5, r5, LSL #16 |
michael@0 | 1015 | PKHTB r3, r3, r5, ASR #20 ; r3 = t[0]-t[5]+8>>4 |
michael@0 | 1016 | STR r3, [r0, #76] ; y[5<<3] = t[0]-t[5]+8>>4 |
michael@0 | 1017 | SSUB16 r6, r12,r6 ; r6 = t[0]-t[6]+8 |
michael@0 | 1018 | MOV r3, r6, ASR #4 |
michael@0 | 1019 | MOV r6, r6, LSL #16 |
michael@0 | 1020 | PKHTB r3, r3, r6, ASR #20 ; r3 = t[0]-t[6]+8>>4 |
michael@0 | 1021 | STR r3, [r0, #92] ; y[6<<3] = t[0]-t[6]+8>>4 |
michael@0 | 1022 | SSUB16 r7, r12,r7 ; r7 = t[0]-t[7]+8 |
michael@0 | 1023 | MOV r3, r7, ASR #4 |
michael@0 | 1024 | MOV r7, r7, LSL #16 |
michael@0 | 1025 | PKHTB r3, r3, r7, ASR #20 ; r3 = t[0]-t[7]+8>>4 |
michael@0 | 1026 | STR r3, [r0, #108] ; y[7<<3] = t[0]-t[7]+8>>4 |
michael@0 | 1027 | MOV PC,r14 |
michael@0 | 1028 | ENDP |
michael@0 | 1029 | |
michael@0 | 1030 | ; In theory this should save ~75 cycles over oc_idct8x8_10, more than enough to |
michael@0 | 1031 | ; pay for increased branch mis-prediction to get here, but in practice it |
michael@0 | 1032 | ; doesn't seem to slow anything down to take it out, and it's less code this |
michael@0 | 1033 | ; way. |
michael@0 | 1034 | [ 0 |
michael@0 | 1035 | oc_idct8x8_6_v6 PROC |
michael@0 | 1036 | STMFD r13!,{r4-r8,r10,r11,r14} |
michael@0 | 1037 | SUB r13,r13,#64*2+4 |
michael@0 | 1038 | ; Row transforms |
michael@0 | 1039 | MOV r8, r0 |
michael@0 | 1040 | AND r0, r13,#4 ; Align the stack. |
michael@0 | 1041 | ADD r0, r0, r13 ; Write to temp storage. |
michael@0 | 1042 | BL idct3_2core_v6 |
michael@0 | 1043 | BL idct1core_v6 |
michael@0 | 1044 | ; Clear input data for next block (decoder only). |
michael@0 | 1045 | SUB r0, r1, #3*16 |
michael@0 | 1046 | CMP r0, r8 |
michael@0 | 1047 | AND r1, r13,#4 ; Align the stack. |
michael@0 | 1048 | BEQ oc_idct8x8_6_v6_cols |
michael@0 | 1049 | MOV r4, #0 |
michael@0 | 1050 | MOV r5, #0 |
michael@0 | 1051 | STRD r4, [r0] |
michael@0 | 1052 | STR r4, [r0,#16] |
michael@0 | 1053 | STR r4, [r0,#32] |
michael@0 | 1054 | MOV r0, r8 ; Write to the final destination. |
michael@0 | 1055 | oc_idct8x8_6_v6_cols |
michael@0 | 1056 | ; Column transforms |
michael@0 | 1057 | ADD r1, r1, r13 ; And read from temp storage. |
michael@0 | 1058 | BL idct3_3core_down_v6 |
michael@0 | 1059 | BL idct3_3core_down_v6 |
michael@0 | 1060 | BL idct3_3core_down_v6 |
michael@0 | 1061 | BL idct3_3core_down_v6 |
michael@0 | 1062 | ADD r13,r13,#64*2+4 |
michael@0 | 1063 | LDMFD r13!,{r4-r8,r10,r11,PC} |
michael@0 | 1064 | ENDP |
michael@0 | 1065 | |
michael@0 | 1066 | idct1core_v6 PROC |
michael@0 | 1067 | ; r0 = ogg_int16_t *_y (destination) |
michael@0 | 1068 | ; r1 = const ogg_int16_t *_x (source) |
michael@0 | 1069 | LDRSH r3, [r1], #16 |
michael@0 | 1070 | MOV r12,#0x05 |
michael@0 | 1071 | ORR r12,r12,#0xB500 |
michael@0 | 1072 | MUL r3, r12, r3 |
michael@0 | 1073 | ; Stall ? |
michael@0 | 1074 | MOV r3, r3, ASR #16 |
michael@0 | 1075 | ; Don't need to actually store the odd lines; they won't be read. |
michael@0 | 1076 | STRH r3, [r0], #2 |
michael@0 | 1077 | STRH r3, [r0, #30] |
michael@0 | 1078 | STRH r3, [r0, #62] |
michael@0 | 1079 | STRH r3, [r0, #94] |
michael@0 | 1080 | MOV PC,R14 |
michael@0 | 1081 | ENDP |
michael@0 | 1082 | |
michael@0 | 1083 | idct3_2core_v6 PROC |
michael@0 | 1084 | ; r0 = ogg_int16_t *_y (destination) |
michael@0 | 1085 | ; r1 = const ogg_int16_t *_x (source) |
michael@0 | 1086 | ; Stage 1: |
michael@0 | 1087 | LDRD r4, [r1], #16 ; r4 = <x[0,1]|x[0,0]>; r5 = <*|x[0,2]> |
michael@0 | 1088 | LDRD r10,OC_C6S2_3_v6 ; r10= OC_C6S2; r11= OC_C2S6 |
michael@0 | 1089 | ; Stall |
michael@0 | 1090 | SMULWB r3, r11,r5 ; r3 = t[0,3]=OC_C2S6*x[0,2]>>16 |
michael@0 | 1091 | LDR r11,OC_C4S4 |
michael@0 | 1092 | SMULWB r2, r10,r5 ; r2 = t[0,2]=OC_C6S2*x[0,2]>>16 |
michael@0 | 1093 | LDR r5, [r1], #16 ; r5 = <x[1,1]|x[1,0]> |
michael@0 | 1094 | SMULWB r12,r11,r4 ; r12= (t[0,0]=OC_C4S4*x[0,0]>>16) |
michael@0 | 1095 | LDRD r6, OC_C7S1_3_v6 ; r6 = OC_C7S1; r7 = OC_C1S7 |
michael@0 | 1096 | SMULWB r10,r11,r5 ; r10= (t[1,0]=OC_C4S4*x[1,0]>>16) |
michael@0 | 1097 | PKHBT r12,r12,r10,LSL #16 ; r12= <t[1,0]|t[0,0]> |
michael@0 | 1098 | SMULWT r10,r7, r5 ; r10= t[1,7]=OC_C1S7*x[1,1]>>16 |
michael@0 | 1099 | PKHBT r2, r2, r11 ; r2 = <0|t[0,2]> |
michael@0 | 1100 | SMULWT r7, r7, r4 ; r7 = t[0,7]=OC_C1S7*x[0,1]>>16 |
michael@0 | 1101 | PKHBT r3, r3, r11 ; r3 = <0|t[0,3]> |
michael@0 | 1102 | SMULWT r5, r6, r5 ; r10= t[1,4]=OC_C7S1*x[1,1]>>16 |
michael@0 | 1103 | PKHBT r7, r7, r10,LSL #16 ; r7 = <t[1,7]|t[0,7]> |
michael@0 | 1104 | SMULWT r4, r6, r4 ; r4 = t[0,4]=OC_C7S1*x[0,1]>>16 |
michael@0 | 1105 | ; Stage 2: |
michael@0 | 1106 | SMULWB r6, r11,r7 ; r6 = t[0,6]=OC_C4S4*t[0,7]>>16 |
michael@0 | 1107 | PKHBT r4, r4, r5, LSL #16 ; r4 = <t[1,4]|t[0,4]> |
michael@0 | 1108 | SMULWT r10,r11,r7 ; r10= t[1,6]=OC_C4S4*t[1,7]>>16 |
michael@0 | 1109 | SMULWB r5, r11,r4 ; r5 = t[0,5]=OC_C4S4*t[0,4]>>16 |
michael@0 | 1110 | PKHBT r6, r6, r10,LSL #16 ; r6 = <t[1,6]|t[0,6]> |
michael@0 | 1111 | SMULWT r10,r11,r4 ; r10= t[1,5]=OC_C4S4*t[1,4]>>16 |
michael@0 | 1112 | ; Stage 3: |
michael@0 | 1113 | B idct4_3core_stage3_v6 |
michael@0 | 1114 | ENDP |
michael@0 | 1115 | |
michael@0 | 1116 | ; Another copy so the LDRD offsets are less than +/- 255. |
michael@0 | 1117 | ALIGN 8 |
michael@0 | 1118 | OC_C7S1_3_v6 |
michael@0 | 1119 | DCD 12785 ; 31F1 |
michael@0 | 1120 | OC_C1S7_3_v6 |
michael@0 | 1121 | DCD 64277 ; FB15 |
michael@0 | 1122 | OC_C6S2_3_v6 |
michael@0 | 1123 | DCD 25080 ; 61F8 |
michael@0 | 1124 | OC_C2S6_3_v6 |
michael@0 | 1125 | DCD 60547 ; EC83 |
michael@0 | 1126 | |
michael@0 | 1127 | idct3_3core_down_v6 PROC |
michael@0 | 1128 | ; r0 = ogg_int16_t *_y (destination) |
michael@0 | 1129 | ; r1 = const ogg_int16_t *_x (source) |
michael@0 | 1130 | ; Stage 1: |
michael@0 | 1131 | LDRD r10,[r1], #16 ; r10= <x[0,1]|x[0,0]>; r11= <??|x[0,2]> |
michael@0 | 1132 | LDRD r6, OC_C6S2_3_v6 ; r6 = OC_C6S2; r7 = OC_C2S6 |
michael@0 | 1133 | LDR r4, [r1], #16 ; r4 = <x[1,1]|x[1,0]> |
michael@0 | 1134 | SMULWB r3, r7, r11 ; r3 = t[0,3]=OC_C2S6*x[0,2]>>16 |
michael@0 | 1135 | MOV r7,#8 |
michael@0 | 1136 | SMULWB r2, r6, r11 ; r2 = t[0,2]=OC_C6S2*x[0,2]>>16 |
michael@0 | 1137 | LDR r11,OC_C4S4 |
michael@0 | 1138 | SMLAWB r12,r11,r10,r7 ; r12= t[0,0]+8=(OC_C4S4*x[0,0]>>16)+8 |
michael@0 | 1139 | ; Here we cheat: row 2 had just a DC, so x[0,2]==x[1,2] by definition. |
michael@0 | 1140 | PKHBT r3, r3, r3, LSL #16 ; r3 = <t[0,3]|t[0,3]> |
michael@0 | 1141 | SMLAWB r5, r11,r4, r7 ; r5 = t[1,0]+8=(OC_C4S4*x[1,0]>>16)+8 |
michael@0 | 1142 | PKHBT r2, r2, r2, LSL #16 ; r2 = <t[0,2]|t[0,2]> |
michael@0 | 1143 | LDRD r6, OC_C7S1_3_v6 ; r6 = OC_C7S1; r7 = OC_C1S7 |
michael@0 | 1144 | PKHBT r12,r12,r5, LSL #16 ; r12= <t[1,0]+8|t[0,0]+8> |
michael@0 | 1145 | SMULWT r5, r7, r4 ; r5 = t[1,7]=OC_C1S7*x[1,1]>>16 |
michael@0 | 1146 | SMULWT r7, r7, r10 ; r7 = t[0,7]=OC_C1S7*x[0,1]>>16 |
michael@0 | 1147 | SMULWT r10,r6, r10 ; r10= t[0,4]=OC_C7S1*x[0,1]>>16 |
michael@0 | 1148 | PKHBT r7, r7, r5, LSL #16 ; r7 = <t[1,7]|t[0,7]> |
michael@0 | 1149 | SMULWT r4, r6, r4 ; r4 = t[1,4]=OC_C7S1*x[1,1]>>16 |
michael@0 | 1150 | ; Stage 2: |
michael@0 | 1151 | SMULWB r6, r11,r7 ; r6 = t[0,6]=OC_C4S4*t[0,7]>>16 |
michael@0 | 1152 | PKHBT r4, r10,r4, LSL #16 ; r4 = <t[1,4]|t[0,4]> |
michael@0 | 1153 | SMULWT r10,r11,r7 ; r10= t[1,6]=OC_C4S4*t[1,7]>>16 |
michael@0 | 1154 | SMULWB r5, r11,r4 ; r5 = t[0,5]=OC_C4S4*t[0,4]>>16 |
michael@0 | 1155 | PKHBT r6, r6, r10,LSL #16 ; r6 = <t[1,6]|t[0,6]> |
michael@0 | 1156 | SMULWT r10,r11,r4 ; r10= t[1,5]=OC_C4S4*t[1,4]>>16 |
michael@0 | 1157 | ; Stage 3: |
michael@0 | 1158 | B idct4_4core_down_stage3_v6 |
michael@0 | 1159 | ENDP |
michael@0 | 1160 | ] |
michael@0 | 1161 | |
michael@0 | 1162 | idct4_3core_v6 PROC |
michael@0 | 1163 | ; r0 = ogg_int16_t *_y (destination) |
michael@0 | 1164 | ; r1 = const ogg_int16_t *_x (source) |
michael@0 | 1165 | ; Stage 1: |
michael@0 | 1166 | LDRD r10,[r1], #16 ; r10= <x[0,1]|x[0,0]>; r11= <x[0,3]|x[0,2]> |
michael@0 | 1167 | LDRD r2, OC_C5S3_4_v6 ; r2 = OC_C5S3; r3 = OC_C3S5 |
michael@0 | 1168 | LDRD r4, [r1], #16 ; r4 = <x[1,1]|x[1,0]>; r5 = <??|x[1,2]> |
michael@0 | 1169 | SMULWT r9, r3, r11 ; r9 = t[0,6]=OC_C3S5*x[0,3]>>16 |
michael@0 | 1170 | SMULWT r8, r2, r11 ; r8 = -t[0,5]=OC_C5S3*x[0,3]>>16 |
michael@0 | 1171 | PKHBT r9, r9, r2 ; r9 = <0|t[0,6]> |
michael@0 | 1172 | LDRD r6, OC_C6S2_4_v6 ; r6 = OC_C6S2; r7 = OC_C2S6 |
michael@0 | 1173 | PKHBT r8, r8, r2 ; r9 = <0|-t[0,5]> |
michael@0 | 1174 | SMULWB r3, r7, r11 ; r3 = t[0,3]=OC_C2S6*x[0,2]>>16 |
michael@0 | 1175 | SMULWB r2, r6, r11 ; r2 = t[0,2]=OC_C6S2*x[0,2]>>16 |
michael@0 | 1176 | LDR r11,OC_C4S4 |
michael@0 | 1177 | SMULWB r12,r7, r5 ; r12= t[1,3]=OC_C2S6*x[1,2]>>16 |
michael@0 | 1178 | SMULWB r5, r6, r5 ; r5 = t[1,2]=OC_C6S2*x[1,2]>>16 |
michael@0 | 1179 | PKHBT r3, r3, r12,LSL #16 ; r3 = <t[1,3]|t[0,3]> |
michael@0 | 1180 | SMULWB r12,r11,r10 ; r12= t[0,0]=OC_C4S4*x[0,0]>>16 |
michael@0 | 1181 | PKHBT r2, r2, r5, LSL #16 ; r2 = <t[1,2]|t[0,2]> |
michael@0 | 1182 | SMULWB r5, r11,r4 ; r5 = t[1,0]=OC_C4S4*x[1,0]>>16 |
michael@0 | 1183 | LDRD r6, OC_C7S1_4_v6 ; r6 = OC_C7S1; r7 = OC_C1S7 |
michael@0 | 1184 | PKHBT r12,r12,r5, LSL #16 ; r12= <t[1,0]|t[0,0]> |
michael@0 | 1185 | SMULWT r5, r7, r4 ; r5 = t[1,7]=OC_C1S7*x[1,1]>>16 |
michael@0 | 1186 | SMULWT r7, r7, r10 ; r7 = t[0,7]=OC_C1S7*x[0,1]>>16 |
michael@0 | 1187 | SMULWT r10,r6, r10 ; r10= t[0,4]=OC_C7S1*x[0,1]>>16 |
michael@0 | 1188 | PKHBT r7, r7, r5, LSL #16 ; r7 = <t[1,7]|t[0,7]> |
michael@0 | 1189 | SMULWT r4, r6, r4 ; r4 = t[1,4]=OC_C7S1*x[1,1]>>16 |
michael@0 | 1190 | ; Stage 2: |
michael@0 | 1191 | SSUB16 r6, r7, r9 ; r6 = t[7]-t[6] |
michael@0 | 1192 | PKHBT r4, r10,r4, LSL #16 ; r4 = <t[1,4]|t[0,4]> |
michael@0 | 1193 | SADD16 r7, r7, r9 ; r7 = t[7]=t[7]+t[6] |
michael@0 | 1194 | SMULWT r9, r11,r6 ; r9 = t[1,6]=OC_C4S4*r6T>>16 |
michael@0 | 1195 | SADD16 r5, r4, r8 ; r5 = t[4]-t[5] |
michael@0 | 1196 | SMULWB r6, r11,r6 ; r6 = t[0,6]=OC_C4S4*r6B>>16 |
michael@0 | 1197 | SSUB16 r4, r4, r8 ; r4 = t[4]=t[4]+t[5] |
michael@0 | 1198 | SMULWT r10,r11,r5 ; r10= t[1,5]=OC_C4S4*r5T>>16 |
michael@0 | 1199 | PKHBT r6, r6, r9, LSL #16 ; r6 = <t[1,6]|t[0,6]> |
michael@0 | 1200 | SMULWB r5, r11,r5 ; r5 = t[0,5]=OC_C4S4*r5B>>16 |
michael@0 | 1201 | ; Stage 3: |
michael@0 | 1202 | idct4_3core_stage3_v6 |
michael@0 | 1203 | SADD16 r11,r12,r2 ; r11= t[1]=t[0]+t[2] |
michael@0 | 1204 | PKHBT r10,r5, r10,LSL #16 ; r10= <t[1,5]|t[0,5]> |
michael@0 | 1205 | SSUB16 r2, r12,r2 ; r2 = t[2]=t[0]-t[2] |
michael@0 | 1206 | idct4_3core_stage3_5_v6 |
michael@0 | 1207 | SSUB16 r5, r6, r10 ; r5 = t[5]'=t[6]-t[5] |
michael@0 | 1208 | SADD16 r6, r6, r10 ; r6 = t[6]=t[6]+t[5] |
michael@0 | 1209 | SADD16 r10,r12,r3 ; r10= t[0]'=t[0]+t[3] |
michael@0 | 1210 | SSUB16 r3, r12,r3 ; r3 = t[3]=t[0]-t[3] |
michael@0 | 1211 | ; Stage 4: |
michael@0 | 1212 | SADD16 r12,r10,r7 ; r12= t[0]+t[7] |
michael@0 | 1213 | STR r12,[r0], #4 ; y[0<<3] = t[0]+t[7] |
michael@0 | 1214 | SADD16 r12,r11,r6 ; r12= t[1]+t[6] |
michael@0 | 1215 | STR r12,[r0, #12] ; y[1<<3] = t[1]+t[6] |
michael@0 | 1216 | SADD16 r12,r2, r5 ; r12= t[2]+t[5] |
michael@0 | 1217 | STR r12,[r0, #28] ; y[2<<3] = t[2]+t[5] |
michael@0 | 1218 | SADD16 r12,r3, r4 ; r12= t[3]+t[4] |
michael@0 | 1219 | STR r12,[r0, #44] ; y[3<<3] = t[3]+t[4] |
michael@0 | 1220 | SSUB16 r4, r3, r4 ; r4 = t[3]-t[4] |
michael@0 | 1221 | STR r4, [r0, #60] ; y[4<<3] = t[3]-t[4] |
michael@0 | 1222 | SSUB16 r5, r2, r5 ; r5 = t[2]-t[5] |
michael@0 | 1223 | STR r5, [r0, #76] ; y[5<<3] = t[2]-t[5] |
michael@0 | 1224 | SSUB16 r6, r11,r6 ; r6 = t[1]-t[6] |
michael@0 | 1225 | STR r6, [r0, #92] ; y[6<<3] = t[1]-t[6] |
michael@0 | 1226 | SSUB16 r7, r10,r7 ; r7 = t[0]-t[7] |
michael@0 | 1227 | STR r7, [r0, #108] ; y[7<<3] = t[0]-t[7] |
michael@0 | 1228 | MOV PC,r14 |
michael@0 | 1229 | ENDP |
michael@0 | 1230 | |
michael@0 | 1231 | ; Another copy so the LDRD offsets are less than +/- 255. |
michael@0 | 1232 | ALIGN 8 |
michael@0 | 1233 | OC_C7S1_4_v6 |
michael@0 | 1234 | DCD 12785 ; 31F1 |
michael@0 | 1235 | OC_C1S7_4_v6 |
michael@0 | 1236 | DCD 64277 ; FB15 |
michael@0 | 1237 | OC_C6S2_4_v6 |
michael@0 | 1238 | DCD 25080 ; 61F8 |
michael@0 | 1239 | OC_C2S6_4_v6 |
michael@0 | 1240 | DCD 60547 ; EC83 |
michael@0 | 1241 | OC_C5S3_4_v6 |
michael@0 | 1242 | DCD 36410 ; 8E3A |
michael@0 | 1243 | OC_C3S5_4_v6 |
michael@0 | 1244 | DCD 54491 ; D4DB |
michael@0 | 1245 | |
michael@0 | 1246 | idct4_4core_down_v6 PROC |
michael@0 | 1247 | ; r0 = ogg_int16_t *_y (destination) |
michael@0 | 1248 | ; r1 = const ogg_int16_t *_x (source) |
michael@0 | 1249 | ; Stage 1: |
michael@0 | 1250 | LDRD r10,[r1], #16 ; r10= <x[0,1]|x[0,0]>; r11= <x[0,3]|x[0,2]> |
michael@0 | 1251 | LDRD r2, OC_C5S3_4_v6 ; r2 = OC_C5S3; r3 = OC_C3S5 |
michael@0 | 1252 | LDRD r4, [r1], #16 ; r4 = <x[1,1]|x[1,0]>; r5 = <x[1,3]|x[1,2]> |
michael@0 | 1253 | SMULWT r9, r3, r11 ; r9 = t[0,6]=OC_C3S5*x[0,3]>>16 |
michael@0 | 1254 | LDRD r6, OC_C6S2_4_v6 ; r6 = OC_C6S2; r7 = OC_C2S6 |
michael@0 | 1255 | SMULWT r8, r2, r11 ; r8 = -t[0,5]=OC_C5S3*x[0,3]>>16 |
michael@0 | 1256 | ; Here we cheat: row 3 had just a DC, so x[0,3]==x[1,3] by definition. |
michael@0 | 1257 | PKHBT r9, r9, r9, LSL #16 ; r9 = <t[0,6]|t[0,6]> |
michael@0 | 1258 | SMULWB r3, r7, r11 ; r3 = t[0,3]=OC_C2S6*x[0,2]>>16 |
michael@0 | 1259 | PKHBT r8, r8, r8, LSL #16 ; r8 = <-t[0,5]|-t[0,5]> |
michael@0 | 1260 | SMULWB r2, r6, r11 ; r2 = t[0,2]=OC_C6S2*x[0,2]>>16 |
michael@0 | 1261 | LDR r11,OC_C4S4 |
michael@0 | 1262 | SMULWB r12,r7, r5 ; r12= t[1,3]=OC_C2S6*x[1,2]>>16 |
michael@0 | 1263 | MOV r7,#8 |
michael@0 | 1264 | SMULWB r5, r6, r5 ; r5 = t[1,2]=OC_C6S2*x[1,2]>>16 |
michael@0 | 1265 | PKHBT r3, r3, r12,LSL #16 ; r3 = <t[1,3]|t[0,3]> |
michael@0 | 1266 | SMLAWB r12,r11,r10,r7 ; r12= t[0,0]+8=(OC_C4S4*x[0,0]>>16)+8 |
michael@0 | 1267 | PKHBT r2, r2, r5, LSL #16 ; r2 = <t[1,2]|t[0,2]> |
michael@0 | 1268 | SMLAWB r5, r11,r4 ,r7 ; r5 = t[1,0]+8=(OC_C4S4*x[1,0]>>16)+8 |
michael@0 | 1269 | LDRD r6, OC_C7S1_4_v6 ; r6 = OC_C7S1; r7 = OC_C1S7 |
michael@0 | 1270 | PKHBT r12,r12,r5, LSL #16 ; r12= <t[1,0]+8|t[0,0]+8> |
michael@0 | 1271 | SMULWT r5, r7, r4 ; r5 = t[1,7]=OC_C1S7*x[1,1]>>16 |
michael@0 | 1272 | SMULWT r7, r7, r10 ; r7 = t[0,7]=OC_C1S7*x[0,1]>>16 |
michael@0 | 1273 | SMULWT r10,r6, r10 ; r10= t[0,4]=OC_C7S1*x[0,1]>>16 |
michael@0 | 1274 | PKHBT r7, r7, r5, LSL #16 ; r7 = <t[1,7]|t[0,7]> |
michael@0 | 1275 | SMULWT r4, r6, r4 ; r4 = t[1,4]=OC_C7S1*x[1,1]>>16 |
michael@0 | 1276 | ; Stage 2: |
michael@0 | 1277 | SSUB16 r6, r7, r9 ; r6 = t[7]-t[6] |
michael@0 | 1278 | PKHBT r4, r10,r4, LSL #16 ; r4 = <t[1,4]|t[0,4]> |
michael@0 | 1279 | SADD16 r7, r7, r9 ; r7 = t[7]=t[7]+t[6] |
michael@0 | 1280 | SMULWT r9, r11,r6 ; r9 = t[1,6]=OC_C4S4*r6T>>16 |
michael@0 | 1281 | SADD16 r5, r4, r8 ; r5 = t[4]-t[5] |
michael@0 | 1282 | SMULWB r6, r11,r6 ; r6 = t[0,6]=OC_C4S4*r6B>>16 |
michael@0 | 1283 | SSUB16 r4, r4, r8 ; r4 = t[4]=t[4]+t[5] |
michael@0 | 1284 | SMULWT r10,r11,r5 ; r10= t[1,5]=OC_C4S4*r5T>>16 |
michael@0 | 1285 | PKHBT r6, r6, r9, LSL #16 ; r6 = <t[1,6]|t[0,6]> |
michael@0 | 1286 | SMULWB r5, r11,r5 ; r5 = t[0,5]=OC_C4S4*r5B>>16 |
michael@0 | 1287 | ; Stage 3: |
michael@0 | 1288 | idct4_4core_down_stage3_v6 |
michael@0 | 1289 | SADD16 r11,r12,r2 ; r11= t[1]+8=t[0]+t[2]+8 |
michael@0 | 1290 | PKHBT r10,r5, r10,LSL #16 ; r10= <t[1,5]|t[0,5]> |
michael@0 | 1291 | SSUB16 r2, r12,r2 ; r2 = t[2]+8=t[0]-t[2]+8 |
michael@0 | 1292 | B idct8_8core_down_stage3_5_v6 |
michael@0 | 1293 | ENDP |
michael@0 | 1294 | |
michael@0 | 1295 | idct8_8core_v6 PROC |
michael@0 | 1296 | STMFD r13!,{r0,r14} |
michael@0 | 1297 | ; Stage 1: |
michael@0 | 1298 | ;5-6 rotation by 3pi/16 |
michael@0 | 1299 | LDRD r10,OC_C5S3_4_v6 ; r10= OC_C5S3, r11= OC_C3S5 |
michael@0 | 1300 | LDR r4, [r1,#8] ; r4 = <x[0,5]|x[0,4]> |
michael@0 | 1301 | LDR r7, [r1,#24] ; r7 = <x[1,5]|x[1,4]> |
michael@0 | 1302 | SMULWT r5, r11,r4 ; r5 = OC_C3S5*x[0,5]>>16 |
michael@0 | 1303 | LDR r0, [r1,#4] ; r0 = <x[0,3]|x[0,2]> |
michael@0 | 1304 | SMULWT r3, r11,r7 ; r3 = OC_C3S5*x[1,5]>>16 |
michael@0 | 1305 | LDR r12,[r1,#20] ; r12= <x[1,3]|x[1,2]> |
michael@0 | 1306 | SMULWT r6, r11,r0 ; r6 = OC_C3S5*x[0,3]>>16 |
michael@0 | 1307 | SMULWT r11,r11,r12 ; r11= OC_C3S5*x[1,3]>>16 |
michael@0 | 1308 | SMLAWT r6, r10,r4, r6 ; r6 = t[0,6]=r6+(OC_C5S3*x[0,5]>>16) |
michael@0 | 1309 | PKHBT r5, r5, r3, LSL #16 ; r5 = <r3|r5> |
michael@0 | 1310 | SMLAWT r11,r10,r7, r11 ; r11= t[1,6]=r11+(OC_C5S3*x[1,5]>>16) |
michael@0 | 1311 | PKHBT r4, r4, r7, LSL #16 ; r4 = <x[1,4]|x[0,4]> |
michael@0 | 1312 | SMULWT r3, r10,r0 ; r3 = OC_C5S3*x[0,3]>>16 |
michael@0 | 1313 | PKHBT r6, r6, r11,LSL #16 ; r6 = <t[1,6]|t[0,6]> |
michael@0 | 1314 | SMULWT r8, r10,r12 ; r8 = OC_C5S3*x[1,3]>>16 |
michael@0 | 1315 | ;2-3 rotation by 6pi/16 |
michael@0 | 1316 | LDRD r10,OC_C6S2_4_v6 ; r10= OC_C6S2, r11= OC_C2S6 |
michael@0 | 1317 | PKHBT r3, r3, r8, LSL #16 ; r3 = <r8|r3> |
michael@0 | 1318 | LDR r8, [r1,#12] ; r8 = <x[0,7]|x[0,6]> |
michael@0 | 1319 | SMULWB r2, r10,r0 ; r2 = OC_C6S2*x[0,2]>>16 |
michael@0 | 1320 | SSUB16 r5, r5, r3 ; r5 = <t[1,5]|t[0,5]> |
michael@0 | 1321 | SMULWB r9, r10,r12 ; r9 = OC_C6S2*x[1,2]>>16 |
michael@0 | 1322 | LDR r7, [r1,#28] ; r7 = <x[1,7]|x[1,6]> |
michael@0 | 1323 | SMULWB r3, r10,r8 ; r3 = OC_C6S2*x[0,6]>>16 |
michael@0 | 1324 | SMULWB r10,r10,r7 ; r10= OC_C6S2*x[1,6]>>16 |
michael@0 | 1325 | PKHBT r2, r2, r9, LSL #16 ; r2 = <r2|r9> |
michael@0 | 1326 | SMLAWB r3, r11,r0, r3 ; r3 = t[0,3]=r3+(OC_C2S6*x[0,2]>>16) |
michael@0 | 1327 | SMLAWB r10,r11,r12,r10 ; r10= t[1,3]=r10+(OC_C2S6*x[1,2]>>16) |
michael@0 | 1328 | SMULWB r9, r11,r8 ; r9 = OC_C2S6*x[0,6]>>16 |
michael@0 | 1329 | PKHBT r3, r3, r10,LSL #16 ; r3 = <t[1,6]|t[0,6]> |
michael@0 | 1330 | SMULWB r12,r11,r7 ; r12= OC_C2S6*x[1,6]>>16 |
michael@0 | 1331 | ;4-7 rotation by 7pi/16 |
michael@0 | 1332 | LDRD r10,OC_C7S1_8_v6 ; r10= OC_C7S1, r11= OC_C1S7 |
michael@0 | 1333 | PKHBT r9, r9, r12,LSL #16 ; r9 = <r9|r12> |
michael@0 | 1334 | LDR r0, [r1],#16 ; r0 = <x[0,1]|x[0,0]> |
michael@0 | 1335 | PKHTB r7, r7, r8, ASR #16 ; r7 = <x[1,7]|x[0,7]> |
michael@0 | 1336 | SSUB16 r2, r2, r9 ; r2 = <t[1,2]|t[0,2]> |
michael@0 | 1337 | SMULWB r9, r10,r7 ; r9 = OC_C7S1*x[0,7]>>16 |
michael@0 | 1338 | LDR r14,[r1],#16 ; r14= <x[1,1]|x[1,0]> |
michael@0 | 1339 | SMULWT r12,r10,r7 ; r12= OC_C7S1*x[1,7]>>16 |
michael@0 | 1340 | SMULWT r8, r10,r0 ; r8 = OC_C7S1*x[0,1]>>16 |
michael@0 | 1341 | SMULWT r10,r10,r14 ; r10= OC_C7S1*x[1,1]>>16 |
michael@0 | 1342 | SMLAWT r9, r11,r0, r9 ; r9 = t[0,7]=r9+(OC_C1S7*x[0,1]>>16) |
michael@0 | 1343 | PKHBT r8, r8, r10,LSL #16 ; r8 = <r12|r8> |
michael@0 | 1344 | SMLAWT r12,r11,r14,r12 ; r12= t[1,7]=r12+(OC_C1S7*x[1,1]>>16) |
michael@0 | 1345 | PKHBT r0, r0, r14,LSL #16 ; r0 = <x[1,0]|x[0,0]> |
michael@0 | 1346 | SMULWB r10,r11,r7 ; r10= OC_C1S7*x[0,6]>>16 |
michael@0 | 1347 | PKHBT r9, r9, r12,LSL #16 ; r9 = <t[1,7]|t[0,7]> |
michael@0 | 1348 | SMULWT r12,r11,r7 ; r12= OC_C1S7*x[1,6]>>16 |
michael@0 | 1349 | ;0-1 butterfly |
michael@0 | 1350 | LDR r11,OC_C4S4 |
michael@0 | 1351 | PKHBT r10,r10,r12,LSL #16 ; r10= <r12|r10> |
michael@0 | 1352 | SADD16 r7, r0, r4 ; r7 = x[0]+x[4] |
michael@0 | 1353 | SSUB16 r10,r8, r10 ; r10= <t[1,4]|t[0,4]> |
michael@0 | 1354 | SSUB16 r4, r0, r4 ; r4 = x[0]-x[4] |
michael@0 | 1355 | SMULWB r8, r11,r7 ; r8 = t[0,0]=OC_C4S4*r7B>>16 |
michael@0 | 1356 | SMULWT r12,r11,r7 ; r12= t[1,0]=OC_C4S4*r7T>>16 |
michael@0 | 1357 | SMULWB r7, r11,r4 ; r7 = t[0,1]=OC_C4S4*r4B>>16 |
michael@0 | 1358 | PKHBT r12,r8, r12,LSL #16 ; r12= <t[1,0]|t[0,0]> |
michael@0 | 1359 | SMULWT r8, r11,r4 ; r8 = t[1,1]=OC_C4S4*r4T>>16 |
michael@0 | 1360 | ; Stage 2: |
michael@0 | 1361 | SADD16 r4, r10,r5 ; r4 = t[4]'=t[4]+t[5] |
michael@0 | 1362 | PKHBT r8, r7, r8, LSL #16 ; r8 = <t[1,0]|t[0,0]> |
michael@0 | 1363 | SSUB16 r5, r10,r5 ; r5 = t[4]-t[5] |
michael@0 | 1364 | SMULWB r10,r11,r5 ; r10= t[0,5]=OC_C4S4*r5B>>16 |
michael@0 | 1365 | SADD16 r7, r9, r6 ; r7 = t[7]'=t[7]+t[6] |
michael@0 | 1366 | SMULWT r5, r11,r5 ; r5 = t[1,5]=OC_C4S4*r5T>>16 |
michael@0 | 1367 | SSUB16 r6, r9, r6 ; r6 = t[7]-t[6] |
michael@0 | 1368 | SMULWB r9, r11,r6 ; r9 = t[0,6]=OC_C4S4*r6B>>16 |
michael@0 | 1369 | PKHBT r10,r10,r5, LSL #16 ; r10= <t[1,5]|t[0,5]> |
michael@0 | 1370 | SMULWT r6, r11,r6 ; r6 = t[1,6]=OC_C4S4*r6T>>16 |
michael@0 | 1371 | ; Stage 3: |
michael@0 | 1372 | SADD16 r11,r8, r2 ; r11= t[1]'=t[1]+t[2] |
michael@0 | 1373 | PKHBT r6, r9, r6, LSL #16 ; r6 = <t[1,6]|t[0,6]> |
michael@0 | 1374 | SSUB16 r2, r8, r2 ; r2 = t[2]=t[1]-t[2] |
michael@0 | 1375 | LDMFD r13!,{r0,r14} |
michael@0 | 1376 | B idct4_3core_stage3_5_v6 |
michael@0 | 1377 | ENDP |
michael@0 | 1378 | |
michael@0 | 1379 | ; Another copy so the LDRD offsets are less than +/- 255. |
michael@0 | 1380 | ALIGN 8 |
michael@0 | 1381 | OC_C7S1_8_v6 |
michael@0 | 1382 | DCD 12785 ; 31F1 |
michael@0 | 1383 | OC_C1S7_8_v6 |
michael@0 | 1384 | DCD 64277 ; FB15 |
michael@0 | 1385 | OC_C6S2_8_v6 |
michael@0 | 1386 | DCD 25080 ; 61F8 |
michael@0 | 1387 | OC_C2S6_8_v6 |
michael@0 | 1388 | DCD 60547 ; EC83 |
michael@0 | 1389 | OC_C5S3_8_v6 |
michael@0 | 1390 | DCD 36410 ; 8E3A |
michael@0 | 1391 | OC_C3S5_8_v6 |
michael@0 | 1392 | DCD 54491 ; D4DB |
michael@0 | 1393 | |
michael@0 | 1394 | idct8_8core_down_v6 PROC |
michael@0 | 1395 | STMFD r13!,{r0,r14} |
michael@0 | 1396 | ; Stage 1: |
michael@0 | 1397 | ;5-6 rotation by 3pi/16 |
michael@0 | 1398 | LDRD r10,OC_C5S3_8_v6 ; r10= OC_C5S3, r11= OC_C3S5 |
michael@0 | 1399 | LDR r4, [r1,#8] ; r4 = <x[0,5]|x[0,4]> |
michael@0 | 1400 | LDR r7, [r1,#24] ; r7 = <x[1,5]|x[1,4]> |
michael@0 | 1401 | SMULWT r5, r11,r4 ; r5 = OC_C3S5*x[0,5]>>16 |
michael@0 | 1402 | LDR r0, [r1,#4] ; r0 = <x[0,3]|x[0,2]> |
michael@0 | 1403 | SMULWT r3, r11,r7 ; r3 = OC_C3S5*x[1,5]>>16 |
michael@0 | 1404 | LDR r12,[r1,#20] ; r12= <x[1,3]|x[1,2]> |
michael@0 | 1405 | SMULWT r6, r11,r0 ; r6 = OC_C3S5*x[0,3]>>16 |
michael@0 | 1406 | SMULWT r11,r11,r12 ; r11= OC_C3S5*x[1,3]>>16 |
michael@0 | 1407 | SMLAWT r6, r10,r4, r6 ; r6 = t[0,6]=r6+(OC_C5S3*x[0,5]>>16) |
michael@0 | 1408 | PKHBT r5, r5, r3, LSL #16 ; r5 = <r3|r5> |
michael@0 | 1409 | SMLAWT r11,r10,r7, r11 ; r11= t[1,6]=r11+(OC_C5S3*x[1,5]>>16) |
michael@0 | 1410 | PKHBT r4, r4, r7, LSL #16 ; r4 = <x[1,4]|x[0,4]> |
michael@0 | 1411 | SMULWT r3, r10,r0 ; r3 = OC_C5S3*x[0,3]>>16 |
michael@0 | 1412 | PKHBT r6, r6, r11,LSL #16 ; r6 = <t[1,6]|t[0,6]> |
michael@0 | 1413 | SMULWT r8, r10,r12 ; r8 = OC_C5S3*x[1,3]>>16 |
michael@0 | 1414 | ;2-3 rotation by 6pi/16 |
michael@0 | 1415 | LDRD r10,OC_C6S2_8_v6 ; r10= OC_C6S2, r11= OC_C2S6 |
michael@0 | 1416 | PKHBT r3, r3, r8, LSL #16 ; r3 = <r8|r3> |
michael@0 | 1417 | LDR r8, [r1,#12] ; r8 = <x[0,7]|x[0,6]> |
michael@0 | 1418 | SMULWB r2, r10,r0 ; r2 = OC_C6S2*x[0,2]>>16 |
michael@0 | 1419 | SSUB16 r5, r5, r3 ; r5 = <t[1,5]|t[0,5]> |
michael@0 | 1420 | SMULWB r9, r10,r12 ; r9 = OC_C6S2*x[1,2]>>16 |
michael@0 | 1421 | LDR r7, [r1,#28] ; r7 = <x[1,7]|x[1,6]> |
michael@0 | 1422 | SMULWB r3, r10,r8 ; r3 = OC_C6S2*x[0,6]>>16 |
michael@0 | 1423 | SMULWB r10,r10,r7 ; r10= OC_C6S2*x[1,6]>>16 |
michael@0 | 1424 | PKHBT r2, r2, r9, LSL #16 ; r2 = <r2|r9> |
michael@0 | 1425 | SMLAWB r3, r11,r0, r3 ; r3 = t[0,3]=r3+(OC_C2S6*x[0,2]>>16) |
michael@0 | 1426 | SMLAWB r10,r11,r12,r10 ; r10= t[1,3]=r10+(OC_C2S6*x[1,2]>>16) |
michael@0 | 1427 | SMULWB r9, r11,r8 ; r9 = OC_C2S6*x[0,6]>>16 |
michael@0 | 1428 | PKHBT r3, r3, r10,LSL #16 ; r3 = <t[1,6]|t[0,6]> |
michael@0 | 1429 | SMULWB r12,r11,r7 ; r12= OC_C2S6*x[1,6]>>16 |
michael@0 | 1430 | ;4-7 rotation by 7pi/16 |
michael@0 | 1431 | LDRD r10,OC_C7S1_8_v6 ; r10= OC_C7S1, r11= OC_C1S7 |
michael@0 | 1432 | PKHBT r9, r9, r12,LSL #16 ; r9 = <r9|r12> |
michael@0 | 1433 | LDR r0, [r1],#16 ; r0 = <x[0,1]|x[0,0]> |
michael@0 | 1434 | PKHTB r7, r7, r8, ASR #16 ; r7 = <x[1,7]|x[0,7]> |
michael@0 | 1435 | SSUB16 r2, r2, r9 ; r2 = <t[1,2]|t[0,2]> |
michael@0 | 1436 | SMULWB r9, r10,r7 ; r9 = OC_C7S1*x[0,7]>>16 |
michael@0 | 1437 | LDR r14,[r1],#16 ; r14= <x[1,1]|x[1,0]> |
michael@0 | 1438 | SMULWT r12,r10,r7 ; r12= OC_C7S1*x[1,7]>>16 |
michael@0 | 1439 | SMULWT r8, r10,r0 ; r8 = OC_C7S1*x[0,1]>>16 |
michael@0 | 1440 | SMULWT r10,r10,r14 ; r10= OC_C7S1*x[1,1]>>16 |
michael@0 | 1441 | SMLAWT r9, r11,r0, r9 ; r9 = t[0,7]=r9+(OC_C1S7*x[0,1]>>16) |
michael@0 | 1442 | PKHBT r8, r8, r10,LSL #16 ; r8 = <r12|r8> |
michael@0 | 1443 | SMLAWT r12,r11,r14,r12 ; r12= t[1,7]=r12+(OC_C1S7*x[1,1]>>16) |
michael@0 | 1444 | PKHBT r0, r0, r14,LSL #16 ; r0 = <x[1,0]|x[0,0]> |
michael@0 | 1445 | SMULWB r10,r11,r7 ; r10= OC_C1S7*x[0,6]>>16 |
michael@0 | 1446 | PKHBT r9, r9, r12,LSL #16 ; r9 = <t[1,7]|t[0,7]> |
michael@0 | 1447 | SMULWT r12,r11,r7 ; r12= OC_C1S7*x[1,6]>>16 |
michael@0 | 1448 | ;0-1 butterfly |
michael@0 | 1449 | LDR r11,OC_C4S4 |
michael@0 | 1450 | MOV r14,#8 |
michael@0 | 1451 | PKHBT r10,r10,r12,LSL #16 ; r10= <r12|r10> |
michael@0 | 1452 | SADD16 r7, r0, r4 ; r7 = x[0]+x[4] |
michael@0 | 1453 | SSUB16 r10,r8, r10 ; r10= <t[1,4]|t[0,4]> |
michael@0 | 1454 | SMLAWB r8, r11,r7, r14 ; r8 = t[0,0]+8=(OC_C4S4*r7B>>16)+8 |
michael@0 | 1455 | SSUB16 r4, r0, r4 ; r4 = x[0]-x[4] |
michael@0 | 1456 | SMLAWT r12,r11,r7, r14 ; r12= t[1,0]+8=(OC_C4S4*r7T>>16)+8 |
michael@0 | 1457 | SMLAWB r7, r11,r4, r14 ; r7 = t[0,1]+8=(OC_C4S4*r4B>>16)+8 |
michael@0 | 1458 | PKHBT r12,r8, r12,LSL #16 ; r12= <t[1,0]+8|t[0,0]+8> |
michael@0 | 1459 | SMLAWT r8, r11,r4, r14 ; r8 = t[1,1]+8=(OC_C4S4*r4T>>16)+8 |
michael@0 | 1460 | ; Stage 2: |
michael@0 | 1461 | SADD16 r4, r10,r5 ; r4 = t[4]'=t[4]+t[5] |
michael@0 | 1462 | PKHBT r8, r7, r8, LSL #16 ; r8 = <t[1,0]+8|t[0,0]+8> |
michael@0 | 1463 | SSUB16 r5, r10,r5 ; r5 = t[4]-t[5] |
michael@0 | 1464 | SMULWB r10,r11,r5 ; r10= t[0,5]=OC_C4S4*r5B>>16 |
michael@0 | 1465 | SADD16 r7, r9, r6 ; r7 = t[7]'=t[7]+t[6] |
michael@0 | 1466 | SMULWT r5, r11,r5 ; r5 = t[1,5]=OC_C4S4*r5T>>16 |
michael@0 | 1467 | SSUB16 r6, r9, r6 ; r6 = t[7]-t[6] |
michael@0 | 1468 | SMULWB r9, r11,r6 ; r9 = t[0,6]=OC_C4S4*r6B>>16 |
michael@0 | 1469 | PKHBT r10,r10,r5, LSL #16 ; r10= <t[1,5]|t[0,5]> |
michael@0 | 1470 | SMULWT r6, r11,r6 ; r6 = t[1,6]=OC_C4S4*r6T>>16 |
michael@0 | 1471 | ; Stage 3: |
michael@0 | 1472 | SADD16 r11,r8, r2 ; r11= t[1]'+8=t[1]+t[2]+8 |
michael@0 | 1473 | PKHBT r6, r9, r6, LSL #16 ; r6 = <t[1,6]|t[0,6]> |
michael@0 | 1474 | SSUB16 r2, r8, r2 ; r2 = t[2]+8=t[1]-t[2]+8 |
michael@0 | 1475 | LDMFD r13!,{r0,r14} |
michael@0 | 1476 | idct8_8core_down_stage3_5_v6 |
michael@0 | 1477 | SSUB16 r5, r6, r10 ; r5 = t[5]'=t[6]-t[5] |
michael@0 | 1478 | SADD16 r6, r6, r10 ; r6 = t[6]=t[6]+t[5] |
michael@0 | 1479 | SADD16 r10,r12,r3 ; r10= t[0]'+8=t[0]+t[3]+8 |
michael@0 | 1480 | SSUB16 r3, r12,r3 ; r3 = t[3]+8=t[0]-t[3]+8 |
michael@0 | 1481 | ; Stage 4: |
michael@0 | 1482 | SADD16 r12,r10,r7 ; r12= t[0]+t[7]+8 |
michael@0 | 1483 | SSUB16 r7, r10,r7 ; r7 = t[0]-t[7]+8 |
michael@0 | 1484 | MOV r10,r12,ASR #4 |
michael@0 | 1485 | MOV r12,r12,LSL #16 |
michael@0 | 1486 | PKHTB r10,r10,r12,ASR #20 ; r10= t[0]+t[7]+8>>4 |
michael@0 | 1487 | STR r10,[r0], #4 ; y[0<<3] = t[0]+t[7]+8>>4 |
michael@0 | 1488 | SADD16 r12,r11,r6 ; r12= t[1]+t[6]+8 |
michael@0 | 1489 | SSUB16 r6, r11,r6 ; r6 = t[1]-t[6]+8 |
michael@0 | 1490 | MOV r10,r12,ASR #4 |
michael@0 | 1491 | MOV r12,r12,LSL #16 |
michael@0 | 1492 | PKHTB r10,r10,r12,ASR #20 ; r10= t[1]+t[6]+8>>4 |
michael@0 | 1493 | STR r10,[r0, #12] ; y[1<<3] = t[1]+t[6]+8>>4 |
michael@0 | 1494 | SADD16 r12,r2, r5 ; r12= t[2]+t[5]+8 |
michael@0 | 1495 | SSUB16 r5, r2, r5 ; r5 = t[2]-t[5]+8 |
michael@0 | 1496 | MOV r10,r12,ASR #4 |
michael@0 | 1497 | MOV r12,r12,LSL #16 |
michael@0 | 1498 | PKHTB r10,r10,r12,ASR #20 ; r10= t[2]+t[5]+8>>4 |
michael@0 | 1499 | STR r10,[r0, #28] ; y[2<<3] = t[2]+t[5]+8>>4 |
michael@0 | 1500 | SADD16 r12,r3, r4 ; r12= t[3]+t[4]+8 |
michael@0 | 1501 | SSUB16 r4, r3, r4 ; r4 = t[3]-t[4]+8 |
michael@0 | 1502 | MOV r10,r12,ASR #4 |
michael@0 | 1503 | MOV r12,r12,LSL #16 |
michael@0 | 1504 | PKHTB r10,r10,r12,ASR #20 ; r10= t[3]+t[4]+8>>4 |
michael@0 | 1505 | STR r10,[r0, #44] ; y[3<<3] = t[3]+t[4]+8>>4 |
michael@0 | 1506 | MOV r10,r4, ASR #4 |
michael@0 | 1507 | MOV r4, r4, LSL #16 |
michael@0 | 1508 | PKHTB r10,r10,r4, ASR #20 ; r10= t[3]-t[4]+8>>4 |
michael@0 | 1509 | STR r10,[r0, #60] ; y[4<<3] = t[3]-t[4]+8>>4 |
michael@0 | 1510 | MOV r10,r5, ASR #4 |
michael@0 | 1511 | MOV r5, r5, LSL #16 |
michael@0 | 1512 | PKHTB r10,r10,r5, ASR #20 ; r10= t[2]-t[5]+8>>4 |
michael@0 | 1513 | STR r10,[r0, #76] ; y[5<<3] = t[2]-t[5]+8>>4 |
michael@0 | 1514 | MOV r10,r6, ASR #4 |
michael@0 | 1515 | MOV r6, r6, LSL #16 |
michael@0 | 1516 | PKHTB r10,r10,r6, ASR #20 ; r10= t[1]-t[6]+8>>4 |
michael@0 | 1517 | STR r10,[r0, #92] ; y[6<<3] = t[1]-t[6]+8>>4 |
michael@0 | 1518 | MOV r10,r7, ASR #4 |
michael@0 | 1519 | MOV r7, r7, LSL #16 |
michael@0 | 1520 | PKHTB r10,r10,r7, ASR #20 ; r10= t[0]-t[7]+8>>4 |
michael@0 | 1521 | STR r10,[r0, #108] ; y[7<<3] = t[0]-t[7]+8>>4 |
michael@0 | 1522 | MOV PC,r14 |
michael@0 | 1523 | ENDP |
michael@0 | 1524 | ] |
michael@0 | 1525 | |
michael@0 | 1526 | [ OC_ARM_ASM_NEON |
michael@0 | 1527 | EXPORT oc_idct8x8_1_neon |
michael@0 | 1528 | EXPORT oc_idct8x8_neon |
michael@0 | 1529 | |
michael@0 | 1530 | ALIGN 16 |
michael@0 | 1531 | OC_IDCT_CONSTS_NEON |
michael@0 | 1532 | DCW 8 |
michael@0 | 1533 | DCW 64277 ; FB15 (C1S7) |
michael@0 | 1534 | DCW 60547 ; EC83 (C2S6) |
michael@0 | 1535 | DCW 54491 ; D4DB (C3S5) |
michael@0 | 1536 | DCW 46341 ; B505 (C4S4) |
michael@0 | 1537 | DCW 36410 ; 471D (C5S3) |
michael@0 | 1538 | DCW 25080 ; 30FC (C6S2) |
michael@0 | 1539 | DCW 12785 ; 31F1 (C7S1) |
michael@0 | 1540 | |
michael@0 | 1541 | oc_idct8x8_1_neon PROC |
michael@0 | 1542 | ; r0 = ogg_int16_t *_y |
michael@0 | 1543 | ; r1 = ogg_uint16_t _dc |
michael@0 | 1544 | VDUP.S16 Q0, r1 |
michael@0 | 1545 | VMOV Q1, Q0 |
michael@0 | 1546 | VST1.64 {D0, D1, D2, D3}, [r0@128]! |
michael@0 | 1547 | VST1.64 {D0, D1, D2, D3}, [r0@128]! |
michael@0 | 1548 | VST1.64 {D0, D1, D2, D3}, [r0@128]! |
michael@0 | 1549 | VST1.64 {D0, D1, D2, D3}, [r0@128] |
michael@0 | 1550 | MOV PC, r14 |
michael@0 | 1551 | ENDP |
michael@0 | 1552 | |
michael@0 | 1553 | oc_idct8x8_neon PROC |
michael@0 | 1554 | ; r0 = ogg_int16_t *_y |
michael@0 | 1555 | ; r1 = ogg_int16_t *_x |
michael@0 | 1556 | ; r2 = int _last_zzi |
michael@0 | 1557 | CMP r2, #10 |
michael@0 | 1558 | BLE oc_idct8x8_10_neon |
michael@0 | 1559 | oc_idct8x8_slow_neon |
michael@0 | 1560 | VPUSH {D8-D15} |
michael@0 | 1561 | MOV r2, r1 |
michael@0 | 1562 | ADR r3, OC_IDCT_CONSTS_NEON |
michael@0 | 1563 | ; Row transforms (input is pre-transposed) |
michael@0 | 1564 | VLD1.64 {D16,D17,D18,D19}, [r2@128]! |
michael@0 | 1565 | VLD1.64 {D20,D21,D22,D23}, [r2@128]! |
michael@0 | 1566 | VLD1.64 {D24,D25,D26,D27}, [r2@128]! |
michael@0 | 1567 | VSUB.S16 Q1, Q8, Q12 ; Q8 = x[0]-x[4] |
michael@0 | 1568 | VLD1.64 {D28,D29,D30,D31}, [r2@128] |
michael@0 | 1569 | VADD.S16 Q8, Q8, Q12 ; Q1 = x[0]+x[4] |
michael@0 | 1570 | VLD1.64 {D0,D1}, [r3@128] |
michael@0 | 1571 | MOV r12, r14 |
michael@0 | 1572 | BL oc_idct8x8_stage123_neon |
michael@0 | 1573 | ; Stage 4 |
michael@0 | 1574 | VSUB.S16 Q15,Q8, Q7 ; Q15 = y[7]=t[0]'-t[7]' |
michael@0 | 1575 | VADD.S16 Q8, Q8, Q7 ; Q8 = y[0]=t[0]'+t[7]' |
michael@0 | 1576 | VSUB.S16 Q14,Q9, Q3 ; Q14 = y[6]=t[1]'-t[6]'' |
michael@0 | 1577 | VADD.S16 Q9, Q9, Q3 ; Q9 = y[1]=t[1]'+t[6]'' |
michael@0 | 1578 | VSUB.S16 Q13,Q10,Q5 ; Q13 = y[5]=t[2]'-t[5]'' |
michael@0 | 1579 | VADD.S16 Q10,Q10,Q5 ; Q10 = y[2]=t[2]'+t[5]'' |
michael@0 | 1580 | VTRN.16 Q14,Q15 |
michael@0 | 1581 | VSUB.S16 Q12,Q11,Q4 ; Q12 = y[4]=t[3]'-t[4]' |
michael@0 | 1582 | VADD.S16 Q11,Q11,Q4 ; Q11 = y[3]=t[3]'+t[4]' |
michael@0 | 1583 | ; 8x8 Transpose |
michael@0 | 1584 | VTRN.16 Q8, Q9 |
michael@0 | 1585 | VTRN.16 Q10,Q11 |
michael@0 | 1586 | VTRN.16 Q12,Q13 |
michael@0 | 1587 | VTRN.32 Q8, Q10 |
michael@0 | 1588 | VTRN.32 Q9, Q11 |
michael@0 | 1589 | VTRN.32 Q12,Q14 |
michael@0 | 1590 | VTRN.32 Q13,Q15 |
michael@0 | 1591 | VSWP D17,D24 |
michael@0 | 1592 | VSUB.S16 Q1, Q8, Q12 ; Q8 = x[0]-x[4] |
michael@0 | 1593 | VSWP D19,D26 |
michael@0 | 1594 | VADD.S16 Q8, Q8, Q12 ; Q1 = x[0]+x[4] |
michael@0 | 1595 | VSWP D21,D28 |
michael@0 | 1596 | VSWP D23,D30 |
michael@0 | 1597 | ; Column transforms |
michael@0 | 1598 | BL oc_idct8x8_stage123_neon |
michael@0 | 1599 | CMP r0,r1 |
michael@0 | 1600 | ; We have to put the return address back in the LR, or the branch |
michael@0 | 1601 | ; predictor will not recognize the function return and mis-predict the |
michael@0 | 1602 | ; entire call stack. |
michael@0 | 1603 | MOV r14, r12 |
michael@0 | 1604 | ; Stage 4 |
michael@0 | 1605 | VSUB.S16 Q15,Q8, Q7 ; Q15 = y[7]=t[0]'-t[7]' |
michael@0 | 1606 | VADD.S16 Q8, Q8, Q7 ; Q8 = y[0]=t[0]'+t[7]' |
michael@0 | 1607 | VSUB.S16 Q14,Q9, Q3 ; Q14 = y[6]=t[1]'-t[6]'' |
michael@0 | 1608 | VADD.S16 Q9, Q9, Q3 ; Q9 = y[1]=t[1]'+t[6]'' |
michael@0 | 1609 | VSUB.S16 Q13,Q10,Q5 ; Q13 = y[5]=t[2]'-t[5]'' |
michael@0 | 1610 | VADD.S16 Q10,Q10,Q5 ; Q10 = y[2]=t[2]'+t[5]'' |
michael@0 | 1611 | VSUB.S16 Q12,Q11,Q4 ; Q12 = y[4]=t[3]'-t[4]' |
michael@0 | 1612 | VADD.S16 Q11,Q11,Q4 ; Q11 = y[3]=t[3]'+t[4]' |
michael@0 | 1613 | BEQ oc_idct8x8_slow_neon_noclear |
michael@0 | 1614 | VMOV.I8 Q2,#0 |
michael@0 | 1615 | VPOP {D8-D15} |
michael@0 | 1616 | VMOV.I8 Q3,#0 |
michael@0 | 1617 | VRSHR.S16 Q8, Q8, #4 ; Q8 = y[0]+8>>4 |
michael@0 | 1618 | VST1.64 {D4, D5, D6, D7}, [r1@128]! |
michael@0 | 1619 | VRSHR.S16 Q9, Q9, #4 ; Q9 = y[1]+8>>4 |
michael@0 | 1620 | VRSHR.S16 Q10,Q10,#4 ; Q10 = y[2]+8>>4 |
michael@0 | 1621 | VST1.64 {D4, D5, D6, D7}, [r1@128]! |
michael@0 | 1622 | VRSHR.S16 Q11,Q11,#4 ; Q11 = y[3]+8>>4 |
michael@0 | 1623 | VRSHR.S16 Q12,Q12,#4 ; Q12 = y[4]+8>>4 |
michael@0 | 1624 | VST1.64 {D4, D5, D6, D7}, [r1@128]! |
michael@0 | 1625 | VRSHR.S16 Q13,Q13,#4 ; Q13 = y[5]+8>>4 |
michael@0 | 1626 | VRSHR.S16 Q14,Q14,#4 ; Q14 = y[6]+8>>4 |
michael@0 | 1627 | VST1.64 {D4, D5, D6, D7}, [r1@128] |
michael@0 | 1628 | VRSHR.S16 Q15,Q15,#4 ; Q15 = y[7]+8>>4 |
michael@0 | 1629 | VSTMIA r0, {D16-D31} |
michael@0 | 1630 | MOV PC, r14 |
michael@0 | 1631 | |
michael@0 | 1632 | oc_idct8x8_slow_neon_noclear |
michael@0 | 1633 | VPOP {D8-D15} |
michael@0 | 1634 | VRSHR.S16 Q8, Q8, #4 ; Q8 = y[0]+8>>4 |
michael@0 | 1635 | VRSHR.S16 Q9, Q9, #4 ; Q9 = y[1]+8>>4 |
michael@0 | 1636 | VRSHR.S16 Q10,Q10,#4 ; Q10 = y[2]+8>>4 |
michael@0 | 1637 | VRSHR.S16 Q11,Q11,#4 ; Q11 = y[3]+8>>4 |
michael@0 | 1638 | VRSHR.S16 Q12,Q12,#4 ; Q12 = y[4]+8>>4 |
michael@0 | 1639 | VRSHR.S16 Q13,Q13,#4 ; Q13 = y[5]+8>>4 |
michael@0 | 1640 | VRSHR.S16 Q14,Q14,#4 ; Q14 = y[6]+8>>4 |
michael@0 | 1641 | VRSHR.S16 Q15,Q15,#4 ; Q15 = y[7]+8>>4 |
michael@0 | 1642 | VSTMIA r0, {D16-D31} |
michael@0 | 1643 | MOV PC, r14 |
michael@0 | 1644 | ENDP |
michael@0 | 1645 | |
michael@0 | 1646 | oc_idct8x8_stage123_neon PROC |
michael@0 | 1647 | ; Stages 1 & 2 |
michael@0 | 1648 | VMULL.S16 Q4, D18,D1[3] |
michael@0 | 1649 | VMULL.S16 Q5, D19,D1[3] |
michael@0 | 1650 | VMULL.S16 Q7, D30,D1[3] |
michael@0 | 1651 | VMULL.S16 Q6, D31,D1[3] |
michael@0 | 1652 | VMULL.S16 Q2, D30,D0[1] |
michael@0 | 1653 | VMULL.S16 Q3, D31,D0[1] |
michael@0 | 1654 | VSHRN.S32 D8, Q4, #16 |
michael@0 | 1655 | VSHRN.S32 D9, Q5, #16 ; Q4 = (OC_C7S1*x[1]>>16) |
michael@0 | 1656 | VSHRN.S32 D14,Q7, #16 |
michael@0 | 1657 | VSHRN.S32 D15,Q6, #16 ; Q7 = (OC_C7S1*x[7]>>16) |
michael@0 | 1658 | VSHRN.S32 D4, Q2, #16 |
michael@0 | 1659 | VSHRN.S32 D5, Q3, #16 ; Q2 = (OC_C1S7*x[7]>>16)-x[7] |
michael@0 | 1660 | VSUB.S16 Q4, Q4, Q15 |
michael@0 | 1661 | VADD.S16 Q7, Q7, Q9 |
michael@0 | 1662 | VSUB.S16 Q4, Q4, Q2 ; Q4 = t[4] |
michael@0 | 1663 | VMULL.S16 Q2, D18,D0[1] |
michael@0 | 1664 | VMULL.S16 Q9, D19,D0[1] |
michael@0 | 1665 | VMULL.S16 Q5, D26,D0[3] |
michael@0 | 1666 | VMULL.S16 Q3, D27,D0[3] |
michael@0 | 1667 | VMULL.S16 Q6, D22,D0[3] |
michael@0 | 1668 | VMULL.S16 Q12,D23,D0[3] |
michael@0 | 1669 | VSHRN.S32 D4, Q2, #16 |
michael@0 | 1670 | VSHRN.S32 D5, Q9, #16 ; Q2 = (OC_C1S7*x[1]>>16)-x[1] |
michael@0 | 1671 | VSHRN.S32 D10,Q5, #16 |
michael@0 | 1672 | VSHRN.S32 D11,Q3, #16 ; Q5 = (OC_C3S5*x[5]>>16)-x[5] |
michael@0 | 1673 | VSHRN.S32 D12,Q6, #16 |
michael@0 | 1674 | VSHRN.S32 D13,Q12,#16 ; Q6 = (OC_C3S5*x[3]>>16)-x[3] |
michael@0 | 1675 | VADD.S16 Q7, Q7, Q2 ; Q7 = t[7] |
michael@0 | 1676 | VSUB.S16 Q5, Q5, Q11 |
michael@0 | 1677 | VADD.S16 Q6, Q6, Q11 |
michael@0 | 1678 | VADD.S16 Q5, Q5, Q13 |
michael@0 | 1679 | VADD.S16 Q6, Q6, Q13 |
michael@0 | 1680 | VMULL.S16 Q9, D22,D1[1] |
michael@0 | 1681 | VMULL.S16 Q11,D23,D1[1] |
michael@0 | 1682 | VMULL.S16 Q15,D26,D1[1] |
michael@0 | 1683 | VMULL.S16 Q13,D27,D1[1] |
michael@0 | 1684 | VMULL.S16 Q2, D20,D1[2] |
michael@0 | 1685 | VMULL.S16 Q12,D21,D1[2] |
michael@0 | 1686 | VSHRN.S32 D18,Q9, #16 |
michael@0 | 1687 | VSHRN.S32 D19,Q11,#16 ; Q9 = (OC_C5S3*x[3]>>16)-x[3] |
michael@0 | 1688 | VSHRN.S32 D30,Q15,#16 |
michael@0 | 1689 | VSHRN.S32 D31,Q13,#16 ; Q15= (OC_C5S3*x[5]>>16)-x[5] |
michael@0 | 1690 | VSHRN.S32 D4, Q2, #16 |
michael@0 | 1691 | VSHRN.S32 D5, Q12,#16 ; Q2 = (OC_C6S2*x[2]>>16) |
michael@0 | 1692 | VSUB.S16 Q5, Q5, Q9 ; Q5 = t[5] |
michael@0 | 1693 | VADD.S16 Q6, Q6, Q15 ; Q6 = t[6] |
michael@0 | 1694 | VSUB.S16 Q2, Q2, Q14 |
michael@0 | 1695 | VMULL.S16 Q3, D28,D1[2] |
michael@0 | 1696 | VMULL.S16 Q11,D29,D1[2] |
michael@0 | 1697 | VMULL.S16 Q12,D28,D0[2] |
michael@0 | 1698 | VMULL.S16 Q9, D29,D0[2] |
michael@0 | 1699 | VMULL.S16 Q13,D20,D0[2] |
michael@0 | 1700 | VMULL.S16 Q15,D21,D0[2] |
michael@0 | 1701 | VSHRN.S32 D6, Q3, #16 |
michael@0 | 1702 | VSHRN.S32 D7, Q11,#16 ; Q3 = (OC_C6S2*x[6]>>16) |
michael@0 | 1703 | VSHRN.S32 D24,Q12,#16 |
michael@0 | 1704 | VSHRN.S32 D25,Q9, #16 ; Q12= (OC_C2S6*x[6]>>16)-x[6] |
michael@0 | 1705 | VSHRN.S32 D26,Q13,#16 |
michael@0 | 1706 | VSHRN.S32 D27,Q15,#16 ; Q13= (OC_C2S6*x[2]>>16)-x[2] |
michael@0 | 1707 | VSUB.S16 Q9, Q4, Q5 ; Q9 = t[4]-t[5] |
michael@0 | 1708 | VSUB.S16 Q11,Q7, Q6 ; Q11= t[7]-t[6] |
michael@0 | 1709 | VADD.S16 Q3, Q3, Q10 |
michael@0 | 1710 | VADD.S16 Q4, Q4, Q5 ; Q4 = t[4]'=t[4]+t[5] |
michael@0 | 1711 | VADD.S16 Q7, Q7, Q6 ; Q7 = t[7]'=t[7]+t[6] |
michael@0 | 1712 | VSUB.S16 Q2, Q2, Q12 ; Q2 = t[2] |
michael@0 | 1713 | VADD.S16 Q3, Q3, Q13 ; Q3 = t[3] |
michael@0 | 1714 | VMULL.S16 Q12,D16,D1[0] |
michael@0 | 1715 | VMULL.S16 Q13,D17,D1[0] |
michael@0 | 1716 | VMULL.S16 Q14,D2, D1[0] |
michael@0 | 1717 | VMULL.S16 Q15,D3, D1[0] |
michael@0 | 1718 | VMULL.S16 Q5, D18,D1[0] |
michael@0 | 1719 | VMULL.S16 Q6, D22,D1[0] |
michael@0 | 1720 | VSHRN.S32 D24,Q12,#16 |
michael@0 | 1721 | VSHRN.S32 D25,Q13,#16 |
michael@0 | 1722 | VSHRN.S32 D28,Q14,#16 |
michael@0 | 1723 | VSHRN.S32 D29,Q15,#16 |
michael@0 | 1724 | VMULL.S16 Q13,D19,D1[0] |
michael@0 | 1725 | VMULL.S16 Q15,D23,D1[0] |
michael@0 | 1726 | VADD.S16 Q8, Q8, Q12 ; Q8 = t[0] |
michael@0 | 1727 | VADD.S16 Q1, Q1, Q14 ; Q1 = t[1] |
michael@0 | 1728 | VSHRN.S32 D10,Q5, #16 |
michael@0 | 1729 | VSHRN.S32 D12,Q6, #16 |
michael@0 | 1730 | VSHRN.S32 D11,Q13,#16 |
michael@0 | 1731 | VSHRN.S32 D13,Q15,#16 |
michael@0 | 1732 | VADD.S16 Q5, Q5, Q9 ; Q5 = t[5]'=OC_C4S4*(t[4]-t[5])>>16 |
michael@0 | 1733 | VADD.S16 Q6, Q6, Q11 ; Q6 = t[6]'=OC_C4S4*(t[7]-t[6])>>16 |
michael@0 | 1734 | ; Stage 3 |
michael@0 | 1735 | VSUB.S16 Q11,Q8, Q3 ; Q11 = t[3]''=t[0]-t[3] |
michael@0 | 1736 | VADD.S16 Q8, Q8, Q3 ; Q8 = t[0]''=t[0]+t[3] |
michael@0 | 1737 | VADD.S16 Q9, Q1, Q2 ; Q9 = t[1]''=t[1]+t[2] |
michael@0 | 1738 | VADD.S16 Q3, Q6, Q5 ; Q3 = t[6]''=t[6]'+t[5]' |
michael@0 | 1739 | VSUB.S16 Q10,Q1, Q2 ; Q10 = t[2]''=t[1]-t[2] |
michael@0 | 1740 | VSUB.S16 Q5, Q6, Q5 ; Q5 = t[5]''=t[6]'-t[5]' |
michael@0 | 1741 | MOV PC, r14 |
michael@0 | 1742 | ENDP |
michael@0 | 1743 | |
michael@0 | 1744 | oc_idct8x8_10_neon PROC |
michael@0 | 1745 | ADR r3, OC_IDCT_CONSTS_NEON |
michael@0 | 1746 | VLD1.64 {D0,D1}, [r3@128] |
michael@0 | 1747 | MOV r2, r1 |
michael@0 | 1748 | ; Row transforms (input is pre-transposed) |
michael@0 | 1749 | ; Stage 1 |
michael@0 | 1750 | VLD1.64 {D16,D17,D18,D19},[r2@128]! |
michael@0 | 1751 | MOV r12, #16 |
michael@0 | 1752 | VMULL.S16 Q15,D16,D1[0] ; Q15= OC_C4S4*x[0]-(x[0]<<16) |
michael@0 | 1753 | VLD1.64 {D17}, [r2@64], r12 |
michael@0 | 1754 | VMULL.S16 Q2, D18,D0[1] ; Q2 = OC_C1S7*x[1]-(x[1]<<16) |
michael@0 | 1755 | VLD1.64 {D19}, [r2@64] |
michael@0 | 1756 | VMULL.S16 Q14,D17,D0[2] ; Q14= OC_C2S6*x[2]-(x[2]<<16) |
michael@0 | 1757 | VMULL.S16 Q3, D19,D0[3] ; Q3 = OC_C3S5*x[3]-(x[3]<<16) |
michael@0 | 1758 | VMULL.S16 Q13,D19,D1[1] ; Q13= OC_C5S3*x[3]-(x[3]<<16) |
michael@0 | 1759 | VMULL.S16 Q12,D18,D1[3] ; Q12= OC_C7S1*x[1] |
michael@0 | 1760 | VMULL.S16 Q1, D17,D1[2] ; Q1 = OC_C6S2*x[2] |
michael@0 | 1761 | VSHRN.S32 D30,Q15,#16 ; D30= t[0]-x[0] |
michael@0 | 1762 | VSHRN.S32 D4, Q2, #16 ; D4 = t[7]-x[1] |
michael@0 | 1763 | VSHRN.S32 D31,Q14,#16 ; D31= t[3]-x[2] |
michael@0 | 1764 | VSHRN.S32 D6, Q3, #16 ; D6 = t[6]-x[3] |
michael@0 | 1765 | VSHRN.S32 D7, Q13,#16 ; D7 = -t[5]-x[3] |
michael@0 | 1766 | VSHRN.S32 D5, Q12,#16 ; D5 = t[4] |
michael@0 | 1767 | VSHRN.S32 D2, Q1, #16 ; D2 = t[2] |
michael@0 | 1768 | VADD.S16 D4, D4, D18 ; D4 = t[7] |
michael@0 | 1769 | VADD.S16 D6, D6, D19 ; D6 = t[6] |
michael@0 | 1770 | VADD.S16 D7, D7, D19 ; D7 = -t[5] |
michael@0 | 1771 | VADD.S16 Q15,Q15,Q8 ; D30= t[0] |
michael@0 | 1772 | ; D31= t[3] |
michael@0 | 1773 | ; Stages 2 & 3 |
michael@0 | 1774 | VSUB.S16 Q12,Q2, Q3 ; D24= t[7]-t[6] |
michael@0 | 1775 | ; D25= t[4]'=t[4]+t[5] |
michael@0 | 1776 | VADD.S16 Q13,Q2, Q3 ; D26= t[7]'=t[7]+t[6] |
michael@0 | 1777 | ; D27= t[4]-t[5] |
michael@0 | 1778 | VMULL.S16 Q11,D24,D1[0] ; Q11= OC_C4S4*(t[7]-t[6]) |
michael@0 | 1779 | ; -(t[7]-t[6]<<16) |
michael@0 | 1780 | VMULL.S16 Q14,D27,D1[0] ; Q14= OC_C4S4*(t[4]-t[5]) |
michael@0 | 1781 | ; -(t[4]-t[5]<<16) |
michael@0 | 1782 | VADD.S16 D16,D30,D31 ; D16= t[0]'=t[0]+t[3] |
michael@0 | 1783 | VSUB.S16 D17,D30,D2 ; D17= t[2]'=t[0]-t[2] |
michael@0 | 1784 | VADD.S16 D18,D30,D2 ; D18= t[1]'=t[0]+t[2] |
michael@0 | 1785 | VSHRN.S32 D22,Q11,#16 ; D22= (OC_C4S4*(t[7]-t[6])>>16) |
michael@0 | 1786 | ; -(t[7]-t[6]) |
michael@0 | 1787 | VSHRN.S32 D23,Q14,#16 ; D23= (OC_C4S4*(t[4]-t[5])>>16) |
michael@0 | 1788 | ; -(t[4]-t[5]) |
michael@0 | 1789 | VSUB.S16 D19,D30,D31 ; D19= t[3]'=t[0]-t[3] |
michael@0 | 1790 | VADD.S16 D22,D22,D24 ; D22= t[6]'=OC_C4S4*(t[7]-t[6])>>16 |
michael@0 | 1791 | VADD.S16 D23,D23,D27 ; D23= t[5]'=OC_C4S4*(t[4]-t[5])>>16 |
michael@0 | 1792 | VSUB.S16 D27,D22,D23 ; D27= t[5]''=t[6]'-t[5]' |
michael@0 | 1793 | VADD.S16 D24,D22,D23 ; D24= t[6]''=t[6]'+t[5]' |
michael@0 | 1794 | ; Stage 4 |
michael@0 | 1795 | VSUB.S16 Q11,Q8, Q13 ; D22= y[7]=t[0]'-t[7]' |
michael@0 | 1796 | ; D23= y[5]=t[2]'-t[5]'' |
michael@0 | 1797 | VSUB.S16 Q10,Q9, Q12 ; D20= y[6]=t[1]'-t[6]' |
michael@0 | 1798 | ; D21= y[4]=t[3]'-t[4]'' |
michael@0 | 1799 | VADD.S16 Q8, Q8, Q13 ; D16= y[0]=t[0]'+t[7]' |
michael@0 | 1800 | ; D17= y[2]=t[2]'+t[5]'' |
michael@0 | 1801 | VADD.S16 Q9, Q9, Q12 ; D18= y[1]=t[1]'-t[6]' |
michael@0 | 1802 | ; D19= y[3]=t[3]'-t[4]'' |
michael@0 | 1803 | ; 8x4 transpose |
michael@0 | 1804 | VTRN.16 Q10,Q11 ; Q10= c5c4a5a4 c7c6a7a6 |
michael@0 | 1805 | ; Q11= d5d4b5b4 d7d6b7b6 |
michael@0 | 1806 | VTRN.16 Q8, Q9 ; Q8 = c3c2a3a2 c1c0a1a0 |
michael@0 | 1807 | ; Q9 = d3d2b3b2 d1d0b1b0 |
michael@0 | 1808 | VSWP D20,D21 ; Q10= c7c6a7a6 c5c4a5a4 |
michael@0 | 1809 | VSWP D22,D23 ; Q11= d7d6b7b6 d5d4b5b4 |
michael@0 | 1810 | VUZP.32 Q9, Q11 ; Q9 = b7b6b5b4 b3b2b1b0 |
michael@0 | 1811 | ; Q11= d7d6d5d4 d3d2d1d0 |
michael@0 | 1812 | VMULL.S16 Q15,D18,D0[1] |
michael@0 | 1813 | VMULL.S16 Q13,D22,D1[1] |
michael@0 | 1814 | VUZP.32 Q8, Q10 ; Q8 = a7a6a5a4 a3a2a1a0 |
michael@0 | 1815 | ; Q10= c7c6c5c4 c3c2c1c0 |
michael@0 | 1816 | ; Column transforms |
michael@0 | 1817 | ; Stages 1, 2, & 3 |
michael@0 | 1818 | VMULL.S16 Q14,D19,D0[1] ; Q14:Q15= OC_C1S7*x[1]-(x[1]<<16) |
michael@0 | 1819 | VMULL.S16 Q12,D23,D1[1] ; Q12:Q13= OC_C5S3*x[3]-(x[3]<<16) |
michael@0 | 1820 | VMULL.S16 Q3, D22,D0[3] |
michael@0 | 1821 | VMULL.S16 Q2, D23,D0[3] ; Q2:Q3 = OC_C3S5*x[3]-(x[3]<<16) |
michael@0 | 1822 | VSHRN.S32 D30,Q15,#16 |
michael@0 | 1823 | VSHRN.S32 D31,Q14,#16 ; Q15= (OC_C1S7*x[1]>>16)-x[1] |
michael@0 | 1824 | VSHRN.S32 D26,Q13,#16 |
michael@0 | 1825 | VSHRN.S32 D27,Q12,#16 ; Q13= (OC_C5S3*x[3]>>16)-x[3] |
michael@0 | 1826 | VSHRN.S32 D28,Q3, #16 |
michael@0 | 1827 | VSHRN.S32 D29,Q2, #16 ; Q14= (OC_C3S5*x[3]>>16)-x[3] |
michael@0 | 1828 | VADD.S16 Q15,Q15,Q9 ; Q15= t[7] |
michael@0 | 1829 | VADD.S16 Q13,Q13,Q11 ; Q13= -t[5] |
michael@0 | 1830 | VADD.S16 Q14,Q14,Q11 ; Q14= t[6] |
michael@0 | 1831 | VMULL.S16 Q12,D18,D1[3] |
michael@0 | 1832 | VMULL.S16 Q2, D19,D1[3] ; Q2:Q12= OC_C7S1*x[1] |
michael@0 | 1833 | VMULL.S16 Q1, D16,D1[0] |
michael@0 | 1834 | VMULL.S16 Q11,D17,D1[0] ; Q11:Q1 = OC_C4S4*x[0]-(x[0]<<16) |
michael@0 | 1835 | VMULL.S16 Q3, D20,D0[2] |
michael@0 | 1836 | VMULL.S16 Q9, D21,D0[2] ; Q9:Q3 = OC_C2S6*x[2]-(x[2]<<16) |
michael@0 | 1837 | VSHRN.S32 D24,Q12,#16 |
michael@0 | 1838 | VSHRN.S32 D25,Q2, #16 ; Q12= t[4] |
michael@0 | 1839 | VMULL.S16 Q2, D20,D1[2] |
michael@0 | 1840 | VSHRN.S32 D2, Q1, #16 |
michael@0 | 1841 | VSHRN.S32 D3, Q11,#16 ; Q1 = (OC_C4S4*x[0]>>16)-x[0] |
michael@0 | 1842 | VMULL.S16 Q11,D21,D1[2] ; Q2:Q11= OC_C6S2*x[2] |
michael@0 | 1843 | VSHRN.S32 D6, Q3, #16 |
michael@0 | 1844 | VSHRN.S32 D7, Q9, #16 ; Q3 = (OC_C2S6*x[2]>>16)-x[2] |
michael@0 | 1845 | VSUB.S16 Q9, Q15,Q14 ; Q9 = t[7]-t[6] |
michael@0 | 1846 | VADD.S16 Q15,Q15,Q14 ; Q15= t[7]'=t[7]+t[6] |
michael@0 | 1847 | VSHRN.S32 D4, Q2, #16 |
michael@0 | 1848 | VSHRN.S32 D5, Q11,#16 ; Q2 = t[2] |
michael@0 | 1849 | VADD.S16 Q1, Q1, Q8 ; Q1 = t[0] |
michael@0 | 1850 | VADD.S16 Q8, Q12,Q13 ; Q8 = t[4]-t[5] |
michael@0 | 1851 | VADD.S16 Q3, Q3, Q10 ; Q3 = t[3] |
michael@0 | 1852 | VMULL.S16 Q10,D16,D1[0] |
michael@0 | 1853 | VMULL.S16 Q11,D17,D1[0] ; Q11:Q10= OC_C4S4*(t[4]-t[5]) |
michael@0 | 1854 | ; -(t[4]-t[5]<<16) |
michael@0 | 1855 | VSUB.S16 Q12,Q12,Q13 ; Q12= t[4]'=t[4]+t[5] |
michael@0 | 1856 | VMULL.S16 Q14,D18,D1[0] |
michael@0 | 1857 | VMULL.S16 Q13,D19,D1[0] ; Q13:Q14= OC_C4S4*(t[6]-t[7]) |
michael@0 | 1858 | ; -(t[6]-t[7]<<16) |
michael@0 | 1859 | VSHRN.S32 D20,Q10,#16 |
michael@0 | 1860 | VSHRN.S32 D21,Q11,#16 ; Q10= (OC_C4S4*(t[4]-t[5])>>16) |
michael@0 | 1861 | ; -(t[4]-t[5]) |
michael@0 | 1862 | VADD.S16 Q11,Q1, Q3 ; Q11= t[0]'=t[0]+t[3] |
michael@0 | 1863 | VSUB.S16 Q3, Q1, Q3 ; Q3 = t[3]'=t[0]-t[3] |
michael@0 | 1864 | VSHRN.S32 D28,Q14,#16 |
michael@0 | 1865 | VSHRN.S32 D29,Q13,#16 ; Q14= (OC_C4S4*(t[7]-t[6])>>16) |
michael@0 | 1866 | ; -(t[7]-t[6]) |
michael@0 | 1867 | VADD.S16 Q10,Q10,Q8 ; Q10=t[5]' |
michael@0 | 1868 | VADD.S16 Q14,Q14,Q9 ; Q14=t[6]' |
michael@0 | 1869 | VSUB.S16 Q13,Q14,Q10 ; Q13=t[5]''=t[6]'-t[5]' |
michael@0 | 1870 | VADD.S16 Q14,Q14,Q10 ; Q14=t[6]''=t[6]'+t[5]' |
michael@0 | 1871 | VADD.S16 Q10,Q1, Q2 ; Q10= t[1]'=t[0]+t[2] |
michael@0 | 1872 | VSUB.S16 Q2, Q1, Q2 ; Q2 = t[2]'=t[0]-t[2] |
michael@0 | 1873 | ; Stage 4 |
michael@0 | 1874 | CMP r0, r1 |
michael@0 | 1875 | VADD.S16 Q8, Q11,Q15 ; Q8 = y[0]=t[0]'+t[7]' |
michael@0 | 1876 | VADD.S16 Q9, Q10,Q14 ; Q9 = y[1]=t[1]'+t[6]'' |
michael@0 | 1877 | VSUB.S16 Q15,Q11,Q15 ; Q15 = y[7]=t[0]'-t[7]' |
michael@0 | 1878 | VSUB.S16 Q14,Q10,Q14 ; Q14 = y[6]=t[1]'-t[6]'' |
michael@0 | 1879 | VADD.S16 Q10,Q2, Q13 ; Q10 = y[2]=t[2]'+t[5]'' |
michael@0 | 1880 | VADD.S16 Q11,Q3, Q12 ; Q11 = y[3]=t[3]'+t[4]' |
michael@0 | 1881 | VSUB.S16 Q12,Q3, Q12 ; Q12 = y[4]=t[3]'-t[4]' |
michael@0 | 1882 | VSUB.S16 Q13,Q2, Q13 ; Q13 = y[5]=t[2]'-t[5]'' |
michael@0 | 1883 | BEQ oc_idct8x8_10_neon_noclear |
michael@0 | 1884 | VMOV.I8 D2, #0 |
michael@0 | 1885 | VRSHR.S16 Q8, Q8, #4 ; Q8 = y[0]+8>>4 |
michael@0 | 1886 | VST1.64 {D2}, [r1@64], r12 |
michael@0 | 1887 | VRSHR.S16 Q9, Q9, #4 ; Q9 = y[1]+8>>4 |
michael@0 | 1888 | VRSHR.S16 Q10,Q10,#4 ; Q10 = y[2]+8>>4 |
michael@0 | 1889 | VST1.64 {D2}, [r1@64], r12 |
michael@0 | 1890 | VRSHR.S16 Q11,Q11,#4 ; Q11 = y[3]+8>>4 |
michael@0 | 1891 | VRSHR.S16 Q12,Q12,#4 ; Q12 = y[4]+8>>4 |
michael@0 | 1892 | VST1.64 {D2}, [r1@64], r12 |
michael@0 | 1893 | VRSHR.S16 Q13,Q13,#4 ; Q13 = y[5]+8>>4 |
michael@0 | 1894 | VRSHR.S16 Q14,Q14,#4 ; Q14 = y[6]+8>>4 |
michael@0 | 1895 | VST1.64 {D2}, [r1@64] |
michael@0 | 1896 | VRSHR.S16 Q15,Q15,#4 ; Q15 = y[7]+8>>4 |
michael@0 | 1897 | VSTMIA r0, {D16-D31} |
michael@0 | 1898 | MOV PC, r14 |
michael@0 | 1899 | |
michael@0 | 1900 | oc_idct8x8_10_neon_noclear |
michael@0 | 1901 | VRSHR.S16 Q8, Q8, #4 ; Q8 = y[0]+8>>4 |
michael@0 | 1902 | VRSHR.S16 Q9, Q9, #4 ; Q9 = y[1]+8>>4 |
michael@0 | 1903 | VRSHR.S16 Q10,Q10,#4 ; Q10 = y[2]+8>>4 |
michael@0 | 1904 | VRSHR.S16 Q11,Q11,#4 ; Q11 = y[3]+8>>4 |
michael@0 | 1905 | VRSHR.S16 Q12,Q12,#4 ; Q12 = y[4]+8>>4 |
michael@0 | 1906 | VRSHR.S16 Q13,Q13,#4 ; Q13 = y[5]+8>>4 |
michael@0 | 1907 | VRSHR.S16 Q14,Q14,#4 ; Q14 = y[6]+8>>4 |
michael@0 | 1908 | VRSHR.S16 Q15,Q15,#4 ; Q15 = y[7]+8>>4 |
michael@0 | 1909 | VSTMIA r0, {D16-D31} |
michael@0 | 1910 | MOV PC, r14 |
michael@0 | 1911 | ENDP |
michael@0 | 1912 | ] |
michael@0 | 1913 | |
michael@0 | 1914 | END |