;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;
;  File: media/libvpx/vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm
;


    EXPORT |vp8cx_pack_tokens_into_partitions_armv5|
    IMPORT |vp8_validate_buffer_arm|

    INCLUDE vp8_asm_enc_offsets.asm

    ARM
    REQUIRE8
    PRESERVE8

    AREA    |.text|, CODE, READONLY

    ;-------------------------------------------------------------------
    ; VALIDATE_POS $start, $pos
    ;
    ; Macro for validating the write buffer position. Calls
    ; vp8_validate_buffer_arm with
    ;   r0 = $start
    ;   r1 = $pos
    ;   r2 = writer->buffer_end
    ;   r3 = writer->error
    ;
    ; Requirements:
    ;   - the vp8_writer pointer must be in r0 on entry
    ;   - $start must not be r1 (r1 is overwritten with $pos first)
    ;   - neither $start nor $pos may be r2 or r3 (both are overwritten
    ;     by the loads from the writer before the operands are read)
    ;
    ; r0-r3, r12 and lr are saved and restored around the call; the
    ; rest of the registers are preserved by the subroutine call.
    ;-------------------------------------------------------------------
    MACRO
    VALIDATE_POS        $start, $pos
    push    {r0-r3, r12, lr}        ; rest of regs are preserved by subroutine call
    ldr     r2, [r0, #vp8_writer_buffer_end]
    ldr     r3, [r0, #vp8_writer_error]
    mov     r1, $pos
    mov     r0, $start
    bl      vp8_validate_buffer_arm
    pop     {r0-r3, r12, lr}
    MEND

;-----------------------------------------------------------------------
; vp8cx_pack_tokens_into_partitions_armv5
;
; Arguments:
;   r0  VP8_COMP *cpi
;   r1  unsigned char *cx_data
;   r2  const unsigned char *cx_data_end
;   r3  int num_part
;   s0  vp8_coef_encodings
;   s1  vp8_extra_bits,
;   s2  const vp8_tree_index *
;
; Stack frame (after "push {r4-r12, lr}" = 40 bytes and "sub sp, #40"):
;   [sp, #0]   stop         end of the current token list
;   [sp, #4]   b->tree      extra-bits tree of the current token
;   [sp, #8]   cx_data_end
;   [sp, #12]  mb_rows still to be processed in this partition
;   [sp, #16]  address of the current tokenlist entry
;   [sp, #20]  num_part
;   [sp, #24]  ptr          output position for the current partition
;   [sp, #28]  i            partition index
;   [sp, #32]  tokenlist entry the current partition starts at
;   [sp, #36]  mb_rows
;   [sp, #80]  s0  vp8_coef_encodings
;   [sp, #84]  s1  vp8_extra_bits
;   [sp, #88]  s2  vp8_coef_tree
;
; Register roles inside the encode loops:
;   r0  current vp8_writer        r1  p (current token)
;   r2  lowvalue                  r3  count
;   r5  range                     r8  n (bits left to encode)
;   r4, r6, r7, r9-r12, lr        scratch / per-loop temporaries
;-----------------------------------------------------------------------
|vp8cx_pack_tokens_into_partitions_armv5| PROC
    push    {r4-r12, lr}
    sub     sp, sp, #40

    ; Compute address of cpi->common.mb_rows
    ldr     r4, _VP8_COMP_common_
    ldr     r6, _VP8_COMMON_MBrows_
    add     r4, r0, r4

    ldr     r5, [r4, r6]                ; load up mb_rows

    str     r5, [sp, #36]               ; save mb_rows
    str     r1, [sp, #24]               ; save ptr = cx_data
    str     r3, [sp, #20]               ; save num_part
    str     r2, [sp, #8]                ; save cx_data_end

    ldr     r4, _VP8_COMP_tplist_
    add     r4, r0, r4
    ldr     r7, [r4, #0]                ; dereference cpi->tp_list
    str     r7, [sp, #32]               ; store start of cpi->tp_list

    ldr     r11, _VP8_COMP_bc_          ; load up vp8_writer out of cpi
    add     r0, r0, r11

    mov     r11, #0
    str     r11, [sp, #28]              ; i

numparts_loop
    ldr     r2, _vp8_writer_sz_         ; load up sizeof(vp8_writer)
    add     r0, r0, r2                  ; bc[i + 1]

    ldr     r10, [sp, #24]              ; ptr
    ldr     r5, [sp, #36]               ; move mb_rows to the counting section
    subs    r5, r5, r11                 ; move start point with each partition
                                        ; mb_rows starts at i
                                        ; NOTE: the flags set here are consumed
                                        ; by "ble end_partition" below; nothing
                                        ; in between may modify them.
    str     r5, [sp, #12]

    ; Reset all of the VP8 Writer data for each partition that
    ; is processed.
    ; start_encode

    ldr     r3, [sp, #8]
    str     r3, [r0, #vp8_writer_buffer_end]

    mov     r2, #0                      ; vp8_writer_lowvalue
    mov     r5, #255                    ; vp8_writer_range
    mvn     r3, #23                     ; vp8_writer_count (= -24)

    str     r2,  [r0, #vp8_writer_pos]
    str     r10, [r0, #vp8_writer_buffer]

    ble     end_partition               ; if (mb_rows <= 0) end partition

mb_row_loop

    ldr     r1, [r7, #tokenlist_start]
    ldr     r9, [r7, #tokenlist_stop]
    str     r9, [sp, #0]                ; save stop for later comparison
    str     r7, [sp, #16]               ; tokenlist address for next time

    b       check_p_lt_stop

    ; actual work gets done here!

while_p_lt_stop
    ldrb    r6, [r1, #tokenextra_token] ; t
    ldr     r4, [sp, #80]               ; vp8_coef_encodings
    mov     lr, #0
    add     r4, r4, r6, lsl #3          ; a = vp8_coef_encodings + t
                                        ; (8 bytes per entry)
    ldr     r9, [r1, #tokenextra_context_tree]   ; pp

    ldrb    r7, [r1, #tokenextra_skip_eob_node]

    ldr     r6, [r4, #vp8_token_value]  ; v
    ldr     r8, [r4, #vp8_token_len]    ; n

    ; vp8 specific skip_eob_node
    cmp     r7, #0
    movne   lr, #2                      ; i = 2
    subne   r8, r8, #1                  ; --n

    rsb     r4, r8, #32                 ; 32-n
    ldr     r10, [sp, #88]              ; vp8_coef_tree

    ; v is kept in r12 during the token pack loop
    lsl     r12, r6, r4                 ; r12 = v << 32 - n

; loop start
token_loop
    ldrb    r4, [r9, lr, asr #1]        ; pp [i>>1]
    sub     r7, r5, #1                  ; range-1

    ; Decisions are made based on the bit value shifted
    ; off of v, so set a flag here based on this.
    ; This value is referred to as "bb"
    lsls    r12, r12, #1                ; bb = v >> n
    mul     r6, r4, r7                  ; ((range-1) * pp[i>>1]))

    ; bb can only be 0 or 1.  So only execute this statement
    ; if bb == 1, otherwise it will act like i + 0
    addcs   lr, lr, #1                  ; i + bb

    mov     r7, #1
    ldrsb   lr, [r10, lr]               ; i = vp8_coef_tree[i+bb]
    add     r4, r7, r6, lsr #8          ; 1 + (((range-1) * pp[i>>1]) >> 8)

    addcs   r2, r2, r4                  ; if  (bb) lowvalue += split
    subcs   r4, r5, r4                  ; if  (bb) range = range-split

    ; Counting the leading zeros is used to normalize range.
    clz     r6, r4
    sub     r6, r6, #24                 ; shift

    ; Flag is set on the sum of count.  This flag is used later
    ; to determine if count >= 0
    adds    r3, r3, r6                  ; count += shift
    lsl     r5, r4, r6                  ; range <<= shift
    bmi     token_count_lt_zero         ; if(count >= 0)

    sub     r6, r6, r3                  ; offset = shift - count
    sub     r4, r6, #1                  ; offset-1
    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
    bpl     token_high_bit_not_set

    ; Carry propagation: walk back over trailing 0xff bytes, zeroing
    ; them, then increment the first byte that is not 0xff.
    ldr     r4, [r0, #vp8_writer_pos]   ; x
    sub     r4, r4, #1                  ; x = w->pos-1
    b       token_zero_while_start
token_zero_while_loop
    mov     r10, #0
    strb    r10, [r7, r4]               ; w->buffer[x] =(unsigned char)0
    sub     r4, r4, #1                  ; x--
token_zero_while_start
    cmp     r4, #0
    ; Load the base unconditionally so the byte load below never goes
    ; through a stale r7 when x < 0.
    ldr     r7, [r0, #vp8_writer_buffer]
    ldrb    r11, [r7, r4]
    cmpge   r11, #0xff                  ; only compared when x >= 0
    beq     token_zero_while_loop       ; while (x >= 0 && buffer[x] == 0xff)

    ldr     r7, [r0, #vp8_writer_buffer]
    ldrb    r10, [r7, r4]               ; w->buffer[x]
    add     r10, r10, #1
    strb    r10, [r7, r4]               ; w->buffer[x] + 1
token_high_bit_not_set
    rsb     r4, r6, #24                 ; 24-offset
    ldr     r10, [r0, #vp8_writer_buffer]
    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
    ldr     r4, [r0, #vp8_writer_pos]   ; w->pos
    lsl     r2, r2, r6                  ; lowvalue <<= offset
    mov     r6, r3                      ; shift = count
    add     r11, r4, #1                 ; w->pos++
    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
    str     r11, [r0, #vp8_writer_pos]
    sub     r3, r3, #8                  ; count -= 8

    VALIDATE_POS r10, r11               ; validate_buffer at pos

    strb    r7, [r10, r4]               ; w->buffer[w->pos++]

    ; r10 is used earlier in the loop, but r10 is used as
    ; temp variable here.  So after r10 is used, reload
    ; vp8_coef_tree_dcd into r10
    ldr     r10, [sp, #88]              ; vp8_coef_tree

token_count_lt_zero
    lsl     r2, r2, r6                  ; lowvalue <<= shift

    subs    r8, r8, #1                  ; --n
    bne     token_loop

    ldrb    r6, [r1, #tokenextra_token] ; t
    ldr     r7, [sp, #84]               ; vp8_extra_bits
    ; Add t * sizeof (vp8_extra_bit_struct) to get the desired
    ;  element.  Here vp8_extra_bit_struct == 16
    add     r12, r7, r6, lsl #4         ; b = vp8_extra_bits + t

    ldr     r4, [r12, #vp8_extra_bit_struct_base_val]
    cmp     r4, #0
    beq     skip_extra_bits

;   if( b->base_val)
    ldr     r8, [r12, #vp8_extra_bit_struct_len] ; L
    ldrsh   lr, [r1, #tokenextra_extra] ; e = p->Extra
    cmp     r8, #0                      ; if( L)
    beq     no_extra_bits

    ldr     r9, [r12, #vp8_extra_bit_struct_prob]
    asr     r7, lr, #1                  ; v=e>>1

    ldr     r10, [r12, #vp8_extra_bit_struct_tree]
    str     r10, [sp, #4]               ; b->tree

    rsb     r4, r8, #32
    lsl     r12, r7, r4

    mov     lr, #0                      ; i = 0

extra_bits_loop
    ldrb    r4, [r9, lr, asr #1]        ; pp[i>>1]
    sub     r7, r5, #1                  ; range-1
    lsls    r12, r12, #1                ; v >> n
    mul     r6, r4, r7                  ; (range-1) * pp[i>>1]
    addcs   lr, lr, #1                  ; i + bb

    mov     r7, #1
    ldrsb   lr, [r10, lr]               ; i = b->tree[i+bb]
    add     r4, r7, r6, lsr #8          ; split = 1 +  (((range-1) * pp[i>>1]) >> 8)

    addcs   r2, r2, r4                  ; if  (bb) lowvalue += split
    subcs   r4, r5, r4                  ; if  (bb) range = range-split

    clz     r6, r4
    sub     r6, r6, #24

    adds    r3, r3, r6                  ; count += shift
    lsl     r5, r4, r6                  ; range <<= shift
    bmi     extra_count_lt_zero         ; if(count >= 0)

    sub     r6, r6, r3                  ; offset= shift - count
    sub     r4, r6, #1                  ; offset-1
    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
    bpl     extra_high_bit_not_set

    ldr     r4, [r0, #vp8_writer_pos]   ; x
    sub     r4, r4, #1                  ; x = w->pos - 1
    b       extra_zero_while_start
extra_zero_while_loop
    mov     r10, #0
    strb    r10, [r7, r4]               ; w->buffer[x] =(unsigned char)0
    sub     r4, r4, #1                  ; x--
extra_zero_while_start
    cmp     r4, #0
    ; Unconditional base load: see token_zero_while_start.
    ldr     r7, [r0, #vp8_writer_buffer]
    ldrb    r11, [r7, r4]
    cmpge   r11, #0xff                  ; only compared when x >= 0
    beq     extra_zero_while_loop

    ldr     r7, [r0, #vp8_writer_buffer]
    ldrb    r10, [r7, r4]
    add     r10, r10, #1
    strb    r10, [r7, r4]
extra_high_bit_not_set
    rsb     r4, r6, #24                 ; 24-offset
    ldr     r10, [r0, #vp8_writer_buffer]
    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
    ldr     r4, [r0, #vp8_writer_pos]
    lsl     r2, r2, r6                  ; lowvalue <<= offset
    mov     r6, r3                      ; shift = count
    add     r11, r4, #1                 ; w->pos++
    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
    str     r11, [r0, #vp8_writer_pos]
    sub     r3, r3, #8                  ; count -= 8

    VALIDATE_POS r10, r11               ; validate_buffer at pos

    strb    r7, [r10, r4]               ; w->buffer[w->pos++]=(lowvalue >> (24-offset))
    ldr     r10, [sp, #4]               ; b->tree
extra_count_lt_zero
    lsl     r2, r2, r6

    subs    r8, r8, #1                  ; --n
    bne     extra_bits_loop             ; while (n)

no_extra_bits
    ; Reload e: lr was reused as the tree index in extra_bits_loop.
    ; Same named-offset halfword load as above; only bit 0 is used.
    ldrsh   lr, [r1, #tokenextra_extra] ; e = p->Extra
    add     r4, r5, #1                  ; range + 1
    tst     lr, #1
    lsr     r4, r4, #1                  ; split = (range + 1) >> 1
    addne   r2, r2, r4                  ; lowvalue += split
    subne   r4, r5, r4                  ; range = range-split
    tst     r2, #0x80000000             ; lowvalue & 0x80000000
    lsl     r5, r4, #1                  ; range <<= 1
    beq     end_high_bit_not_set

    ldr     r4, [r0, #vp8_writer_pos]
    mov     r7, #0
    sub     r4, r4, #1
    b       end_zero_while_start
end_zero_while_loop
    strb    r7, [r6, r4]
    sub     r4, r4, #1                  ; x--
end_zero_while_start
    cmp     r4, #0
    ; Unconditional base load: see token_zero_while_start.
    ldr     r6, [r0, #vp8_writer_buffer]
    ldrb    r12, [r6, r4]
    cmpge   r12, #0xff                  ; only compared when x >= 0
    beq     end_zero_while_loop

    ldr     r6, [r0, #vp8_writer_buffer]
    ldrb    r7, [r6, r4]
    add     r7, r7, #1
    strb    r7, [r6, r4]
end_high_bit_not_set
    adds    r3, r3, #1                  ; ++count
    lsl     r2, r2, #1                  ; lowvalue  <<= 1
    bne     end_count_zero

    ldr     r4, [r0, #vp8_writer_pos]
    mvn     r3, #7                      ; count = -8
    ldr     r7, [r0, #vp8_writer_buffer]
    lsr     r6, r2, #24                 ; lowvalue >> 24
    add     r12, r4, #1                 ; w->pos++
    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
    str     r12, [r0, #vp8_writer_pos]

    VALIDATE_POS r7, r12                ; validate_buffer at pos

    strb    r6, [r7, r4]
end_count_zero
skip_extra_bits
    add     r1, r1, #TOKENEXTRA_SZ      ; ++p
check_p_lt_stop
    ldr     r4, [sp, #0]                ; stop
    cmp     r1, r4                      ; while( p < stop)
    bcc     while_p_lt_stop

    ldr     r10, [sp, #20]              ; num_parts
    mov     r1, #TOKENLIST_SZ
    mul     r1, r10, r1                 ; num_parts * TOKENLIST_SZ

    ldr     r6, [sp, #12]               ; mb_rows
    ldr     r7, [sp, #16]               ; tokenlist address
    subs    r6, r6, r10                 ; mb_rows -= num_parts
    add     r7, r7, r1                  ; next element in the array
    str     r6, [sp, #12]
    bgt     mb_row_loop

end_partition
    mov     r12, #32                    ; iterations of stop_encode_loop

stop_encode_loop
    sub     r7, r5, #1                  ; range-1

    mov     r4, r7, lsl #7              ; ((range-1) * 128)

    mov     r7, #1
    add     r4, r7, r4, lsr #8          ; 1 + (((range-1) * 128) >> 8)

    ; Counting the leading zeros is used to normalize range.
    clz     r6, r4
    sub     r6, r6, #24                 ; shift

    ; Flag is set on the sum of count.  This flag is used later
    ; to determine if count >= 0
    adds    r3, r3, r6                  ; count += shift
    lsl     r5, r4, r6                  ; range <<= shift
    bmi     token_count_lt_zero_se      ; if(count >= 0)

    sub     r6, r6, r3                  ; offset = shift - count
    sub     r4, r6, #1                  ; offset-1
    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
    bpl     token_high_bit_not_set_se

    ldr     r4, [r0, #vp8_writer_pos]   ; x
    sub     r4, r4, #1                  ; x = w->pos-1
    b       token_zero_while_start_se
token_zero_while_loop_se
    mov     r10, #0
    strb    r10, [r7, r4]               ; w->buffer[x] =(unsigned char)0
    sub     r4, r4, #1                  ; x--
token_zero_while_start_se
    cmp     r4, #0
    ; Unconditional base load: see token_zero_while_start.
    ldr     r7, [r0, #vp8_writer_buffer]
    ldrb    r11, [r7, r4]
    cmpge   r11, #0xff                  ; only compared when x >= 0
    beq     token_zero_while_loop_se

    ldr     r7, [r0, #vp8_writer_buffer]
    ldrb    r10, [r7, r4]               ; w->buffer[x]
    add     r10, r10, #1
    strb    r10, [r7, r4]               ; w->buffer[x] + 1
token_high_bit_not_set_se
    rsb     r4, r6, #24                 ; 24-offset
    ldr     r10, [r0, #vp8_writer_buffer]
    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
    ldr     r4, [r0, #vp8_writer_pos]   ; w->pos
    lsl     r2, r2, r6                  ; lowvalue <<= offset
    mov     r6, r3                      ; shift = count
    add     r11, r4, #1                 ; w->pos++
    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
    str     r11, [r0, #vp8_writer_pos]
    sub     r3, r3, #8                  ; count -= 8

    VALIDATE_POS r10, r11               ; validate_buffer at pos

    strb    r7, [r10, r4]               ; w->buffer[w->pos++]

token_count_lt_zero_se
    lsl     r2, r2, r6                  ; lowvalue <<= shift

    subs    r12, r12, #1
    bne     stop_encode_loop

    ldr     r4,  [r0, #vp8_writer_pos]  ; w->pos
    ldr     r12, [sp, #24]              ; ptr
    add     r12, r12, r4                ; ptr += w->pos
    str     r12, [sp, #24]

    ldr     r11, [sp, #28]              ; i
    ldr     r10, [sp, #20]              ; num_parts

    add     r11, r11, #1                ; i++
    str     r11, [sp, #28]

    ldr     r7, [sp, #32]               ; cpi->tp_list[i]
    mov     r1, #TOKENLIST_SZ
    add     r7, r7, r1                  ; next element in cpi->tp_list
    str     r7, [sp, #32]               ; cpi->tp_list[i+1]

    cmp     r10, r11
    bgt     numparts_loop               ; while (i < num_parts)

    add     sp, sp, #40
    pop     {r4-r12, pc}
    ENDP

; Literal pool: structure offsets and sizes taken from
; vp8_asm_enc_offsets.asm, loaded PC-relative by the code above.
_VP8_COMP_common_
    DCD     vp8_comp_common
_VP8_COMMON_MBrows_
    DCD     vp8_common_mb_rows
_VP8_COMP_tplist_
    DCD     vp8_comp_tplist
_VP8_COMP_bc_
    DCD     vp8_comp_bc
_vp8_writer_sz_
    DCD     vp8_writer_sz

    END