media/libvpx/vp8/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/media/libvpx/vp8/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,352 @@
     1.4 +;
     1.5 +;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
     1.6 +;
     1.7 +;  Use of this source code is governed by a BSD-style license
     1.8 +;  that can be found in the LICENSE file in the root of the source
     1.9 +;  tree. An additional intellectual property rights grant can be found
    1.10 +;  in the file PATENTS.  All contributing project authors may
    1.11 +;  be found in the AUTHORS file in the root of the source tree.
    1.12 +;
    1.13 +
    1.14 +
    1.15 +    EXPORT |vp8cx_pack_mb_row_tokens_armv5|
    1.16 +    IMPORT |vp8_validate_buffer_arm|
    1.17 +
    1.18 +    INCLUDE vp8_asm_enc_offsets.asm
    1.19 +
    1.20 +    ARM
    1.21 +    REQUIRE8
    1.22 +    PRESERVE8
    1.23 +
    1.24 +    AREA    |.text|, CODE, READONLY
    1.25 +
    1.26 +
    1.27 +    ; macro for validating write buffer position: calls vp8_validate_buffer_arm with r0 = $start, r1 = $pos, r2 = w->buffer_end, r3 = w->error
    1.28 +    ; needs vp8_writer in r0; r0-r3, r12 and lr are saved and restored around the call
    1.29 +    ; start shall not be in r1, and neither argument may be r2 or r3 (they are overwritten before the arguments are read)
    1.30 +    MACRO
    1.31 +    VALIDATE_POS $start, $pos
    1.32 +    push {r0-r3, r12, lr}        ; rest of regs are preserved by subroutine call
    1.33 +    ldr  r2, [r0, #vp8_writer_buffer_end]   ; r2 = w->buffer_end (r0 is still the writer here)
    1.34 +    ldr  r3, [r0, #vp8_writer_error]        ; r3 = w->error
    1.35 +    mov  r1, $pos                ; r1 = position to validate
    1.36 +    mov  r0, $start              ; r0 = buffer start; set last because r0 held the writer until now
    1.37 +    bl   vp8_validate_buffer_arm
    1.38 +    pop  {r0-r3, r12, lr}        ; condition flags are not saved by this macro
    1.39 +    MEND
    1.40 +
    1.41 +; r0 VP8_COMP *cpi            (only cpi->common.mb_rows and cpi->tplist are read)
    1.42 +; r1 vp8_writer *w            (moved to r0 for the body; lowvalue/range/count are kept in r2/r5/r3)
    1.43 +; r2 vp8_coef_encodings       (spilled to [sp, #20])
    1.44 +; r3 vp8_extra_bits           (spilled to [sp, #8])
    1.45 +; s0 vp8_coef_tree            (first stack argument; read at [sp, #64] after the prologue)
    1.46 +
    1.47 +|vp8cx_pack_mb_row_tokens_armv5| PROC
    1.48 +    push    {r4-r12, lr}                ; 10 registers, 40 bytes
    1.49 +    sub     sp, sp, #24                 ; six word-sized locals, named where they are stored below
    1.50 +
    1.51 +    ; Compute address of cpi->common.mb_rows
    1.52 +    ldr     r4, _VP8_COMP_common_       ; offset of common inside *cpi
    1.53 +    ldr     r6, _VP8_COMMON_MBrows_     ; offset of mb_rows inside common
    1.54 +    add     r4, r0, r4                  ; r4 = &cpi->common
    1.55 +
    1.56 +    ldr     r5, [r4, r6]                ; load up mb_rows
    1.57 +
    1.58 +    str     r2, [sp, #20]               ; save vp8_coef_encodings
    1.59 +    str     r5, [sp, #12]               ; save mb_rows
    1.60 +    str     r3, [sp, #8]                ; save vp8_extra_bits
    1.61 +
    1.62 +    ldr     r4, _VP8_COMP_tplist_       ; offset of tplist inside *cpi
    1.63 +    add     r4, r0, r4
    1.64 +    ldr     r7, [r4, #0]                ; dereference cpi->tp_list
    1.65 +
    1.66 +    mov     r0, r1                      ; r0 = w from here on; keep same as other loops
    1.67 +
    1.68 +    ldr     r2, [r0, #vp8_writer_lowvalue]  ; r2 = lowvalue
    1.69 +    ldr     r5, [r0, #vp8_writer_range]     ; r5 = range
    1.70 +    ldr     r3, [r0, #vp8_writer_count]     ; r3 = count
    1.71 +
    1.72 +mb_row_loop
    1.73 +
    1.74 +    ldr     r1, [r7, #tokenlist_start]  ; p = start
    1.75 +    ldr     r9, [r7, #tokenlist_stop]
    1.76 +    str     r9, [sp, #0]                ; save stop for later comparison
    1.77 +    str     r7, [sp, #16]               ; tokenlist address for next time
    1.78 +
    1.79 +    b       check_p_lt_stop             ; test p < stop before packing the first token
    1.80 +
    1.81 +    ; actual work gets done here!
    1.82 +
    1.83 +while_p_lt_stop
    1.84 +    ldrb    r6, [r1, #tokenextra_token] ; t
    1.85 +    ldr     r4, [sp, #20]               ; vp8_coef_encodings
    1.86 +    mov     lr, #0                      ; i = 0
    1.87 +    add     r4, r4, r6, lsl #3          ; a = vp8_coef_encodings + t (t scaled by 8)
    1.88 +    ldr     r9, [r1, #tokenextra_context_tree]   ; pp
    1.89 +
    1.90 +    ldrb    r7, [r1, #tokenextra_skip_eob_node]
    1.91 +
    1.92 +    ldr     r6, [r4, #vp8_token_value]  ; v
    1.93 +    ldr     r8, [r4, #vp8_token_len]    ; n
    1.94 +
    1.95 +    ; vp8 specific skip_eob_node
    1.96 +    cmp     r7, #0
    1.97 +    movne   lr, #2                      ; i = 2
    1.98 +    subne   r8, r8, #1                  ; --n
    1.99 +
   1.100 +    rsb     r4, r8, #32                 ; 32-n
   1.101 +    ldr     r10, [sp, #64]              ; vp8_coef_tree
   1.102 +
   1.103 +    ; v is kept in r12 during the token pack loop
   1.104 +    lsl     r12, r6, r4                 ; r12 = v << (32 - n), next bit to code is bit 31
   1.105 +
   1.106 +; loop start
   1.107 +token_loop
   1.108 +    ldrb    r4, [r9, lr, asr #1]        ; pp [i>>1]
   1.109 +    sub     r7, r5, #1                  ; range-1
   1.110 +
   1.111 +    ; Decisions are made based on the bit value shifted
   1.112 +    ; off of v, so set a flag here based on this.
   1.113 +    ; This value is referred to as "bb"
   1.114 +    lsls    r12, r12, #1                ; bb = v >> n (top bit of r12 moves into carry)
   1.115 +    mul     r6, r4, r7                  ; ((range-1) * pp[i>>1]))
   1.116 +
   1.117 +    ; bb can only be 0 or 1.  So only execute this statement
   1.118 +    ; if bb == 1, otherwise it will act like i + 0
   1.119 +    addcs   lr, lr, #1                  ; i + bb
   1.120 +
   1.121 +    mov     r7, #1
   1.122 +    ldrsb   lr, [r10, lr]               ; i = vp8_coef_tree[i+bb]
   1.123 +    add     r4, r7, r6, lsr #8          ; 1 + (((range-1) * pp[i>>1]) >> 8)
   1.124 +
   1.125 +    addcs   r2, r2, r4                  ; if  (bb) lowvalue += split
   1.126 +    subcs   r4, r5, r4                  ; if  (bb) range = range-split
   1.127 +
   1.128 +    ; Counting the leading zeros is used to normalize range.
   1.129 +    clz     r6, r4
   1.130 +    sub     r6, r6, #24                 ; shift
   1.131 +
   1.132 +    ; Flag is set on the sum of count.  This flag is used later
   1.133 +    ; to determine if count >= 0
   1.134 +    adds    r3, r3, r6                  ; count += shift
   1.135 +    lsl     r5, r4, r6                  ; range <<= shift
   1.136 +    bmi     token_count_lt_zero         ; if(count >= 0)
   1.137 +
   1.138 +    sub     r6, r6, r3                  ; offset = shift - count
   1.139 +    sub     r4, r6, #1                  ; offset-1
   1.140 +    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
   1.141 +    bpl     token_high_bit_not_set
   1.142 +
   1.143 +    ldr     r4, [r0, #vp8_writer_pos]   ; x
   1.144 +    sub     r4, r4, #1                  ; x = w->pos-1
   1.145 +    b       token_zero_while_start
   1.146 +token_zero_while_loop
   1.147 +    mov     r10, #0                     ; r10 is a temp here; vp8_coef_tree is reloaded below
   1.148 +    strb    r10, [r7, r4]               ; w->buffer[x] =(unsigned char)0
   1.149 +    sub     r4, r4, #1                  ; x--
   1.150 +token_zero_while_start
   1.151 +    cmp     r4, #0                      ; while (x >= 0 && w->buffer[x] == 0xff)
   1.152 +    ldrge   r7, [r0, #vp8_writer_buffer]
   1.153 +    ldrb    r11, [r7, r4]               ; NOTE(review): unconditional; if x < 0 on first entry r7 is not the buffer yet
   1.154 +    cmpge   r11, #0xff
   1.155 +    beq     token_zero_while_loop
   1.156 +
   1.157 +    ldr     r7, [r0, #vp8_writer_buffer]
   1.158 +    ldrb    r10, [r7, r4]               ; w->buffer[x]
   1.159 +    add     r10, r10, #1
   1.160 +    strb    r10, [r7, r4]               ; w->buffer[x] + 1
   1.161 +token_high_bit_not_set
   1.162 +    rsb     r4, r6, #24                 ; 24-offset
   1.163 +    ldr     r10, [r0, #vp8_writer_buffer]
   1.164 +    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
   1.165 +    ldr     r4, [r0, #vp8_writer_pos]   ; w->pos
   1.166 +    lsl     r2, r2, r6                  ; lowvalue <<= offset
   1.167 +    mov     r6, r3                      ; shift = count
   1.168 +    add     r11, r4, #1                 ; w->pos++
   1.169 +    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
   1.170 +    str     r11, [r0, #vp8_writer_pos]
   1.171 +    sub     r3, r3, #8                  ; count -= 8
   1.172 +
   1.173 +    VALIDATE_POS r10, r11               ; validate_buffer at pos
   1.174 +
   1.175 +    strb    r7, [r10, r4]               ; w->buffer[w->pos++]
   1.176 +
   1.177 +    ; r10 is used earlier in the loop, but r10 is used as
   1.178 +    ; temp variable here.  So after r10 is used, reload
   1.179 +    ; vp8_coef_tree into r10
   1.180 +    ldr     r10, [sp, #64]              ; vp8_coef_tree
   1.181 +
   1.182 +token_count_lt_zero
   1.183 +    lsl     r2, r2, r6                  ; lowvalue <<= shift
   1.184 +
   1.185 +    subs    r8, r8, #1                  ; --n
   1.186 +    bne     token_loop
   1.187 +
   1.188 +    ldrb    r6, [r1, #tokenextra_token] ; t
   1.189 +    ldr     r7, [sp, #8]                ; vp8_extra_bits
   1.190 +    ; Add t * sizeof (vp8_extra_bit_struct) to get the desired
   1.191 +    ;  element.  Here sizeof (vp8_extra_bit_struct) == 16
   1.192 +    add     r12, r7, r6, lsl #4         ; b = vp8_extra_bits + t
   1.193 +
   1.194 +    ldr     r4, [r12, #vp8_extra_bit_struct_base_val]
   1.195 +    cmp     r4, #0
   1.196 +    beq     skip_extra_bits
   1.197 +
   1.198 +;   if( b->base_val)
   1.199 +    ldr     r8, [r12, #vp8_extra_bit_struct_len] ; L
   1.200 +    ldrsh   lr, [r1, #tokenextra_extra] ; e = p->Extra
   1.201 +    cmp     r8, #0                      ; if( L)
   1.202 +    beq     no_extra_bits
   1.203 +
   1.204 +    ldr     r9, [r12, #vp8_extra_bit_struct_prob]
   1.205 +    asr     r7, lr, #1                  ; v=e>>1
   1.206 +
   1.207 +    ldr     r10, [r12, #vp8_extra_bit_struct_tree]
   1.208 +    str     r10, [sp, #4]               ; b->tree
   1.209 +
   1.210 +    rsb     r4, r8, #32                 ; 32-L
   1.211 +    lsl     r12, r7, r4                 ; r12 = v << (32 - L)
   1.212 +
   1.213 +    mov     lr, #0                      ; i = 0
   1.214 +
   1.215 +extra_bits_loop
   1.216 +    ldrb    r4, [r9, lr, asr #1]            ; pp[i>>1]
   1.217 +    sub     r7, r5, #1                  ; range-1
   1.218 +    lsls    r12, r12, #1                ; v >> n
   1.219 +    mul     r6, r4, r7                  ; (range-1) * pp[i>>1]
   1.220 +    addcs   lr, lr, #1                  ; i + bb
   1.221 +
   1.222 +    mov     r7, #1
   1.223 +    ldrsb   lr, [r10, lr]               ; i = b->tree[i+bb]
   1.224 +    add     r4, r7, r6, lsr #8          ; split = 1 +  (((range-1) * pp[i>>1]) >> 8)
   1.225 +
   1.226 +    addcs   r2, r2, r4                  ; if  (bb) lowvalue += split
   1.227 +    subcs   r4, r5, r4                  ; if  (bb) range = range-split
   1.228 +
   1.229 +    clz     r6, r4
   1.230 +    sub     r6, r6, #24                 ; shift
   1.231 +
   1.232 +    adds    r3, r3, r6                  ; count += shift
   1.233 +    lsl     r5, r4, r6                  ; range <<= shift
   1.234 +    bmi     extra_count_lt_zero         ; if(count >= 0)
   1.235 +
   1.236 +    sub     r6, r6, r3                  ; offset= shift - count
   1.237 +    sub     r4, r6, #1                  ; offset-1
   1.238 +    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
   1.239 +    bpl     extra_high_bit_not_set
   1.240 +
   1.241 +    ldr     r4, [r0, #vp8_writer_pos]   ; x
   1.242 +    sub     r4, r4, #1                  ; x = w->pos - 1
   1.243 +    b       extra_zero_while_start
   1.244 +extra_zero_while_loop
   1.245 +    mov     r10, #0                     ; r10 is a temp here; b->tree is reloaded below
   1.246 +    strb    r10, [r7, r4]               ; w->buffer[x] =(unsigned char)0
   1.247 +    sub     r4, r4, #1                  ; x--
   1.248 +extra_zero_while_start
   1.249 +    cmp     r4, #0                      ; while (x >= 0 && w->buffer[x] == 0xff)
   1.250 +    ldrge   r7, [r0, #vp8_writer_buffer]
   1.251 +    ldrb    r11, [r7, r4]               ; NOTE(review): unconditional; if x < 0 on first entry r7 is not the buffer yet
   1.252 +    cmpge   r11, #0xff
   1.253 +    beq     extra_zero_while_loop
   1.254 +
   1.255 +    ldr     r7, [r0, #vp8_writer_buffer]
   1.256 +    ldrb    r10, [r7, r4]               ; w->buffer[x]
   1.257 +    add     r10, r10, #1
   1.258 +    strb    r10, [r7, r4]               ; w->buffer[x] + 1
   1.259 +extra_high_bit_not_set
   1.260 +    rsb     r4, r6, #24                 ; 24-offset
   1.261 +    ldr     r10, [r0, #vp8_writer_buffer]
   1.262 +    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
   1.263 +    ldr     r4, [r0, #vp8_writer_pos]   ; w->pos
   1.264 +    lsl     r2, r2, r6                  ; lowvalue <<= offset
   1.265 +    mov     r6, r3                      ; shift = count
   1.266 +    add     r11, r4, #1                 ; w->pos++
   1.267 +    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
   1.268 +    str     r11, [r0, #vp8_writer_pos]
   1.269 +    sub     r3, r3, #8                  ; count -= 8
   1.270 +
   1.271 +    VALIDATE_POS r10, r11               ; validate_buffer at pos
   1.272 +
   1.273 +    strb    r7, [r10, r4]               ; w->buffer[w->pos++]=(lowvalue >> (24-offset))
   1.274 +    ldr     r10, [sp, #4]               ; b->tree
   1.275 +extra_count_lt_zero
   1.276 +    lsl     r2, r2, r6                  ; lowvalue <<= shift
   1.277 +
   1.278 +    subs    r8, r8, #1                  ; --n
   1.279 +    bne     extra_bits_loop             ; while (n)
   1.280 +
   1.281 +no_extra_bits
   1.282 +    ldrsh   lr, [r1, #tokenextra_extra] ; e = p->Extra (reloaded: lr was reused as i in extra_bits_loop)
   1.283 +    add     r4, r5, #1                  ; range + 1
   1.284 +    tst     lr, #1                      ; e & 1
   1.285 +    lsr     r4, r4, #1                  ; split = (range + 1) >> 1
   1.286 +    addne   r2, r2, r4                  ; lowvalue += split
   1.287 +    subne   r4, r5, r4                  ; range = range-split
   1.288 +    tst     r2, #0x80000000             ; lowvalue & 0x80000000
   1.289 +    lsl     r5, r4, #1                  ; range <<= 1
   1.290 +    beq     end_high_bit_not_set
   1.291 +
   1.292 +    ldr     r4, [r0, #vp8_writer_pos]
   1.293 +    mov     r7, #0
   1.294 +    sub     r4, r4, #1                  ; x = w->pos - 1
   1.295 +    b       end_zero_while_start
   1.296 +end_zero_while_loop
   1.297 +    strb    r7, [r6, r4]                ; w->buffer[x] = 0
   1.298 +    sub     r4, r4, #1                  ; x--
   1.299 +end_zero_while_start
   1.300 +    cmp     r4, #0                      ; while (x >= 0 && w->buffer[x] == 0xff)
   1.301 +    ldrge   r6, [r0, #vp8_writer_buffer]
   1.302 +    ldrb    r12, [r6, r4]               ; NOTE(review): unconditional; if x < 0 on first entry r6 is not the buffer yet
   1.303 +    cmpge   r12, #0xff
   1.304 +    beq     end_zero_while_loop
   1.305 +
   1.306 +    ldr     r6, [r0, #vp8_writer_buffer]
   1.307 +    ldrb    r7, [r6, r4]                ; w->buffer[x]
   1.308 +    add     r7, r7, #1
   1.309 +    strb    r7, [r6, r4]                ; w->buffer[x] + 1
   1.310 +end_high_bit_not_set
   1.311 +    adds    r3, r3, #1                  ; ++count
   1.312 +    lsl     r2, r2, #1                  ; lowvalue  <<= 1
   1.313 +    bne     end_count_zero              ; count != 0: no byte to write out yet
   1.314 +
   1.315 +    ldr     r4, [r0, #vp8_writer_pos]
   1.316 +    mvn     r3, #7                      ; count = -8
   1.317 +    ldr     r7, [r0, #vp8_writer_buffer]
   1.318 +    lsr     r6, r2, #24                 ; lowvalue >> 24
   1.319 +    add     r12, r4, #1                 ; w->pos++
   1.320 +    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
   1.321 +    str     r12, [r0, #vp8_writer_pos]
   1.322 +
   1.323 +    VALIDATE_POS r7, r12               ; validate_buffer at pos
   1.324 +
   1.325 +    strb    r6, [r7, r4]                ; w->buffer[w->pos++] = lowvalue >> 24
   1.326 +end_count_zero
   1.327 +skip_extra_bits
   1.328 +    add     r1, r1, #TOKENEXTRA_SZ      ; ++p
   1.329 +check_p_lt_stop
   1.330 +    ldr     r4, [sp, #0]                ; stop
   1.331 +    cmp     r1, r4                      ; while( p < stop)
   1.332 +    bcc     while_p_lt_stop
   1.333 +
   1.334 +    ldr     r6, [sp, #12]               ; mb_rows
   1.335 +    ldr     r7, [sp, #16]               ; tokenlist address
   1.336 +    subs    r6, r6, #1                  ; one row done
   1.337 +    add     r7, r7, #TOKENLIST_SZ       ; next element in the array
   1.338 +    str     r6, [sp, #12]
   1.339 +    bne     mb_row_loop                 ; NOTE(review): counts down to 0, so mb_rows is assumed to be > 0 on entry
   1.340 +
   1.341 +    str     r2, [r0, #vp8_writer_lowvalue]  ; write the coder state back to *w
   1.342 +    str     r5, [r0, #vp8_writer_range]
   1.343 +    str     r3, [r0, #vp8_writer_count]
   1.344 +    add     sp, sp, #24
   1.345 +    pop     {r4-r12, pc}
   1.346 +    ENDP
   1.347 +
   1.348 +_VP8_COMP_common_
   1.349 +    DCD     vp8_comp_common             ; offset added to cpi to reach cpi->common
   1.350 +_VP8_COMMON_MBrows_
   1.351 +    DCD     vp8_common_mb_rows          ; offset added to &cpi->common to load mb_rows
   1.352 +_VP8_COMP_tplist_
   1.353 +    DCD     vp8_comp_tplist             ; offset added to cpi to load the tplist pointer
   1.354 +
   1.355 +    END

mercurial