;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


    EXPORT |vp8cx_pack_mb_row_tokens_armv5|
    IMPORT |vp8_validate_buffer_arm|

    INCLUDE vp8_asm_enc_offsets.asm

    ARM
    REQUIRE8
    PRESERVE8

    AREA    |.text|, CODE, READONLY


    ;-----------------------------------------------------------------------
    ; VALIDATE_POS $start, $pos
    ;
    ; Calls vp8_validate_buffer_arm with
    ;   r0 = $start, r1 = $pos,
    ;   r2 = w->buffer_end, r3 = w->error
    ; and restores r0-r3, r12 and lr afterwards.
    ;
    ; Requirements:
    ;   - r0 must hold the vp8_writer pointer on entry.
    ;   - $start must not be r1 (r1 is overwritten with $pos before $start
    ;     is copied into r0).
    ;-----------------------------------------------------------------------
    MACRO
    VALIDATE_POS $start, $pos
    push    {r0-r3, r12, lr}            ; rest of regs are preserved by subroutine call
    ldr     r2, [r0, #vp8_writer_buffer_end]
    ldr     r3, [r0, #vp8_writer_error]
    mov     r1, $pos
    mov     r0, $start
    bl      vp8_validate_buffer_arm
    pop     {r0-r3, r12, lr}
    MEND

;-----------------------------------------------------------------------
; vp8cx_pack_mb_row_tokens_armv5
;
; Walks cpi->tp_list for cpi->common.mb_rows entries and writes every
; token in each [start, stop) range through the vp8_writer.
;
; Arguments:
;   r0  VP8_COMP *cpi
;   r1  vp8_writer *w
;   r2  vp8_coef_encodings
;   r3  vp8_extra_bits
;   s0  vp8_coef_tree               (first stack argument)
;
; Stack frame after the prologue (10 registers pushed + 24 bytes locals):
;   [sp, #0]    stop                (end of the current token list)
;   [sp, #4]    b->tree             (extra-bits tree of the current token)
;   [sp, #8]    vp8_extra_bits
;   [sp, #12]   mb_rows still to process
;   [sp, #16]   address of the current tokenlist element
;   [sp, #20]   vp8_coef_encodings
;   [sp, #64]   vp8_coef_tree       (caller's stack argument)
;
; Register roles inside the loops:
;   r0  w           r1  p (current token)
;   r2  lowvalue    r3  count         r5  range
;   r8  n (bits left)                 r9  pp (probabilities)
;   r10 tree        r12 v (left aligned, MSB is the next bit)
;   lr  i (tree index)
;   r4, r6, r7, r11 are scratch.
;-----------------------------------------------------------------------
|vp8cx_pack_mb_row_tokens_armv5| PROC
    push    {r4-r12, lr}
    sub     sp, sp, #24

    ; Compute address of cpi->common.mb_rows
    ldr     r4, _VP8_COMP_common_
    ldr     r6, _VP8_COMMON_MBrows_
    add     r4, r0, r4

    ldr     r5, [r4, r6]                ; load up mb_rows

    ; The row loop below decrements and tests its counter at the bottom,
    ; so it always runs at least once.  With mb_rows == 0 there is nothing
    ; to pack: leave before anything is read from tp_list or the writer.
    cmp     r5, #0
    beq     pack_mb_rows_done

    str     r2, [sp, #20]               ; save vp8_coef_encodings
    str     r5, [sp, #12]               ; save mb_rows
    str     r3, [sp, #8]                ; save vp8_extra_bits

    ldr     r4, _VP8_COMP_tplist_
    add     r4, r0, r4
    ldr     r7, [r4, #0]                ; dereference cpi->tp_list

    mov     r0, r1                      ; keep same as other loops

    ; Cache the writer state in registers; it is stored back on exit.
    ldr     r2, [r0, #vp8_writer_lowvalue]
    ldr     r5, [r0, #vp8_writer_range]
    ldr     r3, [r0, #vp8_writer_count]

mb_row_loop

    ldr     r1, [r7, #tokenlist_start]
    ldr     r9, [r7, #tokenlist_stop]
    str     r9, [sp, #0]                ; save stop for later comparison
    str     r7, [sp, #16]               ; tokenlist address for next time

    b       check_p_lt_stop

    ; actual work gets done here!

while_p_lt_stop
    ldrb    r6, [r1, #tokenextra_token] ; t
    ldr     r4, [sp, #20]               ; vp8_coef_encodings
    mov     lr, #0
    ; Entries of vp8_coef_encodings are indexed with a stride of 8 bytes.
    add     r4, r4, r6, lsl #3          ; a = vp8_coef_encodings + t
    ldr     r9, [r1, #tokenextra_context_tree]   ; pp

    ldrb    r7, [r1, #tokenextra_skip_eob_node]

    ldr     r6, [r4, #vp8_token_value]  ; v
    ldr     r8, [r4, #vp8_token_len]    ; n

    ; vp8 specific skip_eob_node
    cmp     r7, #0
    movne   lr, #2                      ; i = 2
    subne   r8, r8, #1                  ; --n

    rsb     r4, r8, #32                 ; 32-n
    ldr     r10, [sp, #64]              ; vp8_coef_tree

    ; v is kept in r12 during the token pack loop
    lsl     r12, r6, r4                 ; r12 = v << 32 - n

; loop start
token_loop
    ldrb    r4, [r9, lr, asr #1]        ; pp [i>>1]
    sub     r7, r5, #1                  ; range-1

    ; Decisions are made based on the bit value shifted
    ; off of v, so set a flag here based on this.
    ; This value is referred to as "bb"
    lsls    r12, r12, #1                ; bb = v >> n
    mul     r6, r4, r7                  ; ((range-1) * pp[i>>1]))

    ; bb can only be 0 or 1.  So only execute this statement
    ; if bb == 1, otherwise it will act like i + 0
    addcs   lr, lr, #1                  ; i + bb

    mov     r7, #1
    ldrsb   lr, [r10, lr]               ; i = vp8_coef_tree[i+bb]
    add     r4, r7, r6, lsr #8          ; 1 + (((range-1) * pp[i>>1]) >> 8)

    ; The carry flag from the lsls above is still live here: none of the
    ; instructions in between update the flags.
    addcs   r2, r2, r4                  ; if  (bb) lowvalue += split
    subcs   r4, r5, r4                  ; if  (bb) range = range-split

    ; Counting the leading zeros is used to normalize range.
    clz     r6, r4
    sub     r6, r6, #24                 ; shift

    ; Flag is set on the sum of count.  This flag is used later
    ; to determine if count >= 0
    adds    r3, r3, r6                  ; count += shift
    lsl     r5, r4, r6                  ; range <<= shift
    bmi     token_count_lt_zero         ; if(count >= 0)

    sub     r6, r6, r3                  ; offset = shift - count
    sub     r4, r6, #1                  ; offset-1
    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
    bpl     token_high_bit_not_set

    ; Carry out of lowvalue: turn trailing 0xff bytes into 0 and add one
    ; to the first byte that is not 0xff.
    ldr     r4, [r0, #vp8_writer_pos]   ; x
    sub     r4, r4, #1                  ; x = w->pos-1
    b       token_zero_while_start
token_zero_while_loop
    mov     r10, #0
    strb    r10, [r7, r4]               ; w->buffer[x] =(unsigned char)0
    sub     r4, r4, #1                  ; x--
token_zero_while_start
    cmp     r4, #0
    ldrge   r7, [r0, #vp8_writer_buffer]
    ; NOTE(review): this byte load is unconditional.  When x < 0 its result
    ; is ignored (cmpge is skipped and Z is clear because x != 0), but the
    ; load itself still executes with whatever r7 held before - confirm that
    ; x < 0 cannot occur here or make the load conditional.
    ldrb    r11, [r7, r4]
    cmpge   r11, #0xff
    beq     token_zero_while_loop

    ldr     r7, [r0, #vp8_writer_buffer]
    ldrb    r10, [r7, r4]               ; w->buffer[x]
    add     r10, r10, #1
    strb    r10, [r7, r4]               ; w->buffer[x] + 1
token_high_bit_not_set
    rsb     r4, r6, #24                 ; 24-offset
    ldr     r10, [r0, #vp8_writer_buffer]
    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
    ldr     r4, [r0, #vp8_writer_pos]   ; w->pos
    lsl     r2, r2, r6                  ; lowvalue <<= offset
    mov     r6, r3                      ; shift = count
    add     r11, r4, #1                 ; w->pos++
    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
    str     r11, [r0, #vp8_writer_pos]
    sub     r3, r3, #8                  ; count -= 8

    VALIDATE_POS r10, r11               ; validate_buffer at pos

    strb    r7, [r10, r4]               ; w->buffer[w->pos++]

    ; r10 held vp8_coef_tree earlier in the loop but was used as a
    ; temporary (buffer pointer) above, so reload vp8_coef_tree into r10.
    ldr     r10, [sp, #64]              ; vp8_coef_tree

token_count_lt_zero
    lsl     r2, r2, r6                  ; lowvalue <<= shift

    subs    r8, r8, #1                  ; --n
    bne     token_loop

    ldrb    r6, [r1, #tokenextra_token] ; t
    ldr     r7, [sp, #8]                ; vp8_extra_bits
    ; Add t * sizeof (vp8_extra_bit_struct) to get the desired
    ;  element.  Here sizeof (vp8_extra_bit_struct) == 16 (lsl #4)
    add     r12, r7, r6, lsl #4         ; b = vp8_extra_bits + t

    ldr     r4, [r12, #vp8_extra_bit_struct_base_val]
    cmp     r4, #0
    beq     skip_extra_bits

;   if( b->base_val)
    ldr     r8, [r12, #vp8_extra_bit_struct_len] ; L
    ldrsh   lr, [r1, #tokenextra_extra] ; e = p->Extra
    cmp     r8, #0                      ; if( L)
    beq     no_extra_bits

    ldr     r9, [r12, #vp8_extra_bit_struct_prob]
    asr     r7, lr, #1                  ; v=e>>1

    ldr     r10, [r12, #vp8_extra_bit_struct_tree]
    str     r10, [sp, #4]               ; b->tree

    rsb     r4, r8, #32
    lsl     r12, r7, r4                 ; left align the L bits of v in r12

    mov     lr, #0                      ; i = 0

    ; Same bit-by-bit encode as token_loop, but walking b->tree with the
    ; probabilities from b->prob.
extra_bits_loop
    ldrb    r4, [r9, lr, asr #1]        ; pp[i>>1]
    sub     r7, r5, #1                  ; range-1
    lsls    r12, r12, #1                ; v >> n
    mul     r6, r4, r7                  ; (range-1) * pp[i>>1]
    addcs   lr, lr, #1                  ; i + bb

    mov     r7, #1
    ldrsb   lr, [r10, lr]               ; i = b->tree[i+bb]
    add     r4, r7, r6, lsr #8          ; split = 1 +  (((range-1) * pp[i>>1]) >> 8)

    addcs   r2, r2, r4                  ; if  (bb) lowvalue += split
    subcs   r4, r5, r4                  ; if  (bb) range = range-split

    clz     r6, r4
    sub     r6, r6, #24

    adds    r3, r3, r6                  ; count += shift
    lsl     r5, r4, r6                  ; range <<= shift
    bmi     extra_count_lt_zero         ; if(count >= 0)

    sub     r6, r6, r3                  ; offset= shift - count
    sub     r4, r6, #1                  ; offset-1
    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
    bpl     extra_high_bit_not_set

    ldr     r4, [r0, #vp8_writer_pos]   ; x
    sub     r4, r4, #1                  ; x = w->pos - 1
    b       extra_zero_while_start
extra_zero_while_loop
    mov     r10, #0
    strb    r10, [r7, r4]               ; w->buffer[x] =(unsigned char)0
    sub     r4, r4, #1                  ; x--
extra_zero_while_start
    cmp     r4, #0
    ldrge   r7, [r0, #vp8_writer_buffer]
    ; NOTE(review): unconditional load, see token_zero_while_start.
    ldrb    r11, [r7, r4]
    cmpge   r11, #0xff
    beq     extra_zero_while_loop

    ldr     r7, [r0, #vp8_writer_buffer]
    ldrb    r10, [r7, r4]
    add     r10, r10, #1
    strb    r10, [r7, r4]
extra_high_bit_not_set
    rsb     r4, r6, #24                 ; 24-offset
    ldr     r10, [r0, #vp8_writer_buffer]
    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
    ldr     r4, [r0, #vp8_writer_pos]
    lsl     r2, r2, r6                  ; lowvalue <<= offset
    mov     r6, r3                      ; shift = count
    add     r11, r4, #1                 ; w->pos++
    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
    str     r11, [r0, #vp8_writer_pos]
    sub     r3, r3, #8                  ; count -= 8

    VALIDATE_POS r10, r11               ; validate_buffer at pos

    strb    r7, [r10, r4]               ; w->buffer[w->pos++]=(lowvalue >> (24-offset))
    ldr     r10, [sp, #4]               ; b->tree (r10 was used as a temporary)
extra_count_lt_zero
    lsl     r2, r2, r6

    subs    r8, r8, #1                  ; --n
    bne     extra_bits_loop             ; while (n)

no_extra_bits
    ; Encode the low bit of e with split = (range + 1) >> 1.
    ; lr was reused as the tree index in extra_bits_loop, so p->Extra is
    ; read again here.  Use the same halfword load and generated offset as
    ; the first read of p->Extra instead of a hard-coded word load; only
    ; bit 0 of the value is tested.
    ldrsh   lr, [r1, #tokenextra_extra] ; e = p->Extra
    add     r4, r5, #1                  ; range + 1
    tst     lr, #1
    lsr     r4, r4, #1                  ; split = (range + 1) >> 1
    addne   r2, r2, r4                  ; lowvalue += split
    subne   r4, r5, r4                  ; range = range-split
    tst     r2, #0x80000000             ; lowvalue & 0x80000000
    lsl     r5, r4, #1                  ; range <<= 1
    beq     end_high_bit_not_set

    ldr     r4, [r0, #vp8_writer_pos]
    mov     r7, #0
    sub     r4, r4, #1
    b       end_zero_while_start
end_zero_while_loop
    strb    r7, [r6, r4]
    sub     r4, r4, #1                  ; x--
end_zero_while_start
    cmp     r4, #0
    ldrge   r6, [r0, #vp8_writer_buffer]
    ; NOTE(review): unconditional load, see token_zero_while_start.
    ldrb    r12, [r6, r4]
    cmpge   r12, #0xff
    beq     end_zero_while_loop

    ldr     r6, [r0, #vp8_writer_buffer]
    ldrb    r7, [r6, r4]
    add     r7, r7, #1
    strb    r7, [r6, r4]
end_high_bit_not_set
    adds    r3, r3, #1                  ; ++count
    lsl     r2, r2, #1                  ; lowvalue  <<= 1
    bne     end_count_zero              ; flags still come from the adds above

    ; count reached zero: emit one byte and restart count at -8.
    ldr     r4, [r0, #vp8_writer_pos]
    mvn     r3, #7                      ; count = -8
    ldr     r7, [r0, #vp8_writer_buffer]
    lsr     r6, r2, #24                 ; lowvalue >> 24
    add     r12, r4, #1                 ; w->pos++
    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
    str     r12, [r0, #vp8_writer_pos]

    VALIDATE_POS r7, r12                ; validate_buffer at pos

    strb    r6, [r7, r4]
end_count_zero
skip_extra_bits
    add     r1, r1, #TOKENEXTRA_SZ      ; ++p
check_p_lt_stop
    ldr     r4, [sp, #0]                ; stop
    cmp     r1, r4                      ; while( p < stop)
    bcc     while_p_lt_stop

    ldr     r6, [sp, #12]               ; mb_rows
    ldr     r7, [sp, #16]               ; tokenlist address
    subs    r6, r6, #1
    add     r7, r7, #TOKENLIST_SZ       ; next element in the array
    str     r6, [sp, #12]
    bne     mb_row_loop

    ; Write the cached writer state back.
    str     r2, [r0, #vp8_writer_lowvalue]
    str     r5, [r0, #vp8_writer_range]
    str     r3, [r0, #vp8_writer_count]
pack_mb_rows_done
    add     sp, sp, #24
    pop     {r4-r12, pc}
    ENDP

; Literal pool: structure offsets that are loaded with pc-relative ldr.
_VP8_COMP_common_
    DCD     vp8_comp_common
_VP8_COMMON_MBrows_
    DCD     vp8_common_mb_rows
_VP8_COMP_tplist_
    DCD     vp8_comp_tplist

    END