media/libtheora/lib/arm/armfrag.s

Thu, 22 Jan 2015 13:21:57 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 22 Jan 2015 13:21:57 +0100
branch
TOR_BUG_9701
changeset 15
b8a032363ba2
permissions
-rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

     1 ;********************************************************************
     2 ;*                                                                  *
     3 ;* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
     4 ;* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
     5 ;* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
     6 ;* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
     7 ;*                                                                  *
     8 ;* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
     9 ;* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
    10 ;*                                                                  *
    11 ;********************************************************************
    12 ; Original implementation:
    13 ;  Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd
    14 ; last mod: $Id: armfrag.s 17481 2010-10-03 22:49:42Z tterribe $
    15 ;********************************************************************
    17 	AREA	|.text|, CODE, READONLY
    19 	; Explicitly specifying alignment here because some versions of
    20 	; gas don't align code correctly. See
    21 	; http://lists.gnu.org/archive/html/bug-binutils/2011-06/msg00199.html
    22 	; https://bugzilla.mozilla.org/show_bug.cgi?id=920992
    23 	ALIGN
    25 	GET	armopts.s
    27 ; Vanilla ARM v4 versions
    28 	EXPORT	oc_frag_copy_list_arm
    29 	EXPORT	oc_frag_recon_intra_arm
    30 	EXPORT	oc_frag_recon_inter_arm
    31 	EXPORT	oc_frag_recon_inter2_arm
    33 oc_frag_copy_list_arm PROC
    34 	; r0 = _dst_frame
    35 	; r1 = _src_frame
    36 	; r2 = _ystride
    37 	; r3 = _fragis
    38 	; <> = _nfragis
    39 	; <> = _frag_buf_offs
    40 	LDR	r12,[r13]		; r12 = _nfragis
    41 	STMFD	r13!,{r4-r6,r11,r14}
    42 	SUBS	r12, r12, #1
    43 	LDR	r4,[r3],#4		; r4 = _fragis[fragii]
    44 	LDRGE	r14,[r13,#4*6]		; r14 = _frag_buf_offs
    45 	BLT	ofcl_arm_end
    46 	SUB	r2, r2, #4
    47 ofcl_arm_lp
    48 	LDR	r11,[r14,r4,LSL #2]	; r11 = _frag_buf_offs[_fragis[fragii]]
    49 	SUBS	r12, r12, #1
    50 	; Stall (on XScale)
    51 	ADD	r4, r1, r11		; r4 = _src_frame+frag_buf_off
    52 	LDR	r6, [r4], #4
    53 	ADD	r11,r0, r11		; r11 = _dst_frame+frag_buf_off
    54 	LDR	r5, [r4], r2
    55 	STR	r6, [r11],#4
    56 	LDR	r6, [r4], #4
    57 	STR	r5, [r11],r2
    58 	LDR	r5, [r4], r2
    59 	STR	r6, [r11],#4
    60 	LDR	r6, [r4], #4
    61 	STR	r5, [r11],r2
    62 	LDR	r5, [r4], r2
    63 	STR	r6, [r11],#4
    64 	LDR	r6, [r4], #4
    65 	STR	r5, [r11],r2
    66 	LDR	r5, [r4], r2
    67 	STR	r6, [r11],#4
    68 	LDR	r6, [r4], #4
    69 	STR	r5, [r11],r2
    70 	LDR	r5, [r4], r2
    71 	STR	r6, [r11],#4
    72 	LDR	r6, [r4], #4
    73 	STR	r5, [r11],r2
    74 	LDR	r5, [r4], r2
    75 	STR	r6, [r11],#4
    76 	LDR	r6, [r4], #4
    77 	STR	r5, [r11],r2
    78 	LDR	r5, [r4], r2
    79 	STR	r6, [r11],#4
    80 	LDR	r6, [r4], #4
    81 	STR	r5, [r11],r2
    82 	LDR	r5, [r4]
    83 	LDRGE	r4,[r3],#4		; r4 = _fragis[fragii]
    84 	STR	r6, [r11],#4
    85 	STR	r5, [r11]
    86 	BGE	ofcl_arm_lp
    87 ofcl_arm_end
    88 	LDMFD	r13!,{r4-r6,r11,PC}
    89 oc_frag_recon_intra_arm
    90 	; r0 =       unsigned char *_dst
    91 	; r1 =       int            _ystride
    92 	; r2 = const ogg_int16_t    _residue[64]
    93 	STMFD	r13!,{r4,r5,r14}
    94 	MOV	r14,#8
    95 	MOV	r5, #255
    96 	SUB	r1, r1, #7
    97 ofrintra_lp_arm
    98 	LDRSH	r3, [r2], #2
    99 	LDRSH	r4, [r2], #2
   100 	LDRSH	r12,[r2], #2
   101 	ADDS	r3, r3, #128
   102 	CMPGT	r5, r3
   103 	EORLT	r3, r5, r3, ASR #32
   104 	STRB	r3, [r0], #1
   105 	ADDS	r4, r4, #128
   106 	CMPGT	r5, r4
   107 	EORLT	r4, r5, r4, ASR #32
   108 	LDRSH	r3, [r2], #2
   109 	STRB	r4, [r0], #1
   110 	ADDS	r12,r12,#128
   111 	CMPGT	r5, r12
   112 	EORLT	r12,r5, r12,ASR #32
   113 	LDRSH	r4, [r2], #2
   114 	STRB	r12,[r0], #1
   115 	ADDS	r3, r3, #128
   116 	CMPGT	r5, r3
   117 	EORLT	r3, r5, r3, ASR #32
   118 	LDRSH	r12,[r2], #2
   119 	STRB	r3, [r0], #1
   120 	ADDS	r4, r4, #128
   121 	CMPGT	r5, r4
   122 	EORLT	r4, r5, r4, ASR #32
   123 	LDRSH	r3, [r2], #2
   124 	STRB	r4, [r0], #1
   125 	ADDS	r12,r12,#128
   126 	CMPGT	r5, r12
   127 	EORLT	r12,r5, r12,ASR #32
   128 	LDRSH	r4, [r2], #2
   129 	STRB	r12,[r0], #1
   130 	ADDS	r3, r3, #128
   131 	CMPGT	r5, r3
   132 	EORLT	r3, r5, r3, ASR #32
   133 	STRB	r3, [r0], #1
   134 	ADDS	r4, r4, #128
   135 	CMPGT	r5, r4
   136 	EORLT	r4, r5, r4, ASR #32
   137 	STRB	r4, [r0], r1
   138 	SUBS	r14,r14,#1
   139 	BGT	ofrintra_lp_arm
   140 	LDMFD	r13!,{r4,r5,PC}
   141 	ENDP
    143 oc_frag_recon_inter_arm PROC
        	; Writes an 8x8 block to dst where each pixel is src[i]+residue[i]
        	; clamped to 0..255. src and dst are both stepped with the same ystride.
        	; One loop iteration produces one row of 8 pixels.
    144 	; r0 =       unsigned char *dst
    145 	; r1 = const unsigned char *src
    146 	; r2 =       int            ystride
    147 	; r3 = const ogg_int16_t    residue[64]
    148 	STMFD	r13!,{r5,r9-r11,r14}
    149 	MOV	r9, #8			; r9 = rows remaining
    150 	MOV	r5, #255		; r5 = upper clamp bound
        	; r2 = ystride-7: the 8th byte access of a row advances to the next row.
    151 	SUB	r2, r2, #7
    152 ofrinter_lp_arm
    153 	LDRSH	r12,[r3], #2
    154 	LDRB	r14,[r1], #1
    155 	LDRSH	r11,[r3], #2
    156 	LDRB	r10,[r1], #1
        	; Clamp idiom used for every pixel below:
        	;   ADDS  x = residue+src, setting flags
        	;   CMPGT if x>0, compare 255 with x
        	;   EORLT taken when x<0 or x>255; x ASR #32 is -1 or 0, so the low
        	;         byte of 255^that is 0 or 255
        	;   STRB  stores only the low byte.
        	; The loads interleaved between CMPGT and EORLT do not alter flags.
    157 	ADDS	r12,r12,r14
    158 	CMPGT	r5, r12
    159 	EORLT	r12,r5, r12,ASR #32
    160 	STRB	r12,[r0], #1
    161 	ADDS	r11,r11,r10
    162 	CMPGT	r5, r11
    163 	LDRSH	r12,[r3], #2
    164 	LDRB	r14,[r1], #1
    165 	EORLT	r11,r5, r11,ASR #32
    166 	STRB	r11,[r0], #1
    167 	ADDS	r12,r12,r14
    168 	CMPGT	r5, r12
    169 	LDRSH	r11,[r3], #2
    170 	LDRB	r10,[r1], #1
    171 	EORLT	r12,r5, r12,ASR #32
    172 	STRB	r12,[r0], #1
    173 	ADDS	r11,r11,r10
    174 	CMPGT	r5, r11
    175 	LDRSH	r12,[r3], #2
    176 	LDRB	r14,[r1], #1
    177 	EORLT	r11,r5, r11,ASR #32
    178 	STRB	r11,[r0], #1
    179 	ADDS	r12,r12,r14
    180 	CMPGT	r5, r12
    181 	LDRSH	r11,[r3], #2
    182 	LDRB	r10,[r1], #1
    183 	EORLT	r12,r5, r12,ASR #32
    184 	STRB	r12,[r0], #1
    185 	ADDS	r11,r11,r10
    186 	CMPGT	r5, r11
    187 	LDRSH	r12,[r3], #2
    188 	LDRB	r14,[r1], #1
    189 	EORLT	r11,r5, r11,ASR #32
    190 	STRB	r11,[r0], #1
    191 	ADDS	r12,r12,r14
    192 	CMPGT	r5, r12
    193 	LDRSH	r11,[r3], #2
        	; Last source byte of the row: step src to the next row.
    194 	LDRB	r10,[r1], r2
    195 	EORLT	r12,r5, r12,ASR #32
    196 	STRB	r12,[r0], #1
    197 	ADDS	r11,r11,r10
    198 	CMPGT	r5, r11
    199 	EORLT	r11,r5, r11,ASR #32
        	; Last destination byte of the row: step dst to the next row.
    200 	STRB	r11,[r0], r2
    201 	SUBS	r9, r9, #1
    202 	BGT	ofrinter_lp_arm
    203 	LDMFD	r13!,{r5,r9-r11,PC}
    204 	ENDP
    206 oc_frag_recon_inter2_arm PROC
        	; Writes an 8x8 block to dst where each pixel is
        	; ((src1[i]+src2[i])>>1)+residue[i] clamped to 0..255.
        	; src1, src2 and dst are all stepped with the same ystride.
    207 	; r0 =       unsigned char *dst
    208 	; r1 = const unsigned char *src1
    209 	; r2 = const unsigned char *src2
    210 	; r3 =       int            ystride
        	; The residue pointer is the first stack argument; it is read before
        	; the register push moves r13.
    211 	LDR	r12,[r13]
    212 	; r12= const ogg_int16_t    residue[64]
    213 	STMFD	r13!,{r4-r8,r14}
    214 	MOV	r14,#8			; r14 = rows remaining
    215 	MOV	r8, #255		; r8 = upper clamp bound
        	; r3 = ystride-7: the 8th byte access of a row advances to the next row.
    216 	SUB	r3, r3, #7
    217 ofrinter2_lp_arm
    218 	LDRB	r5, [r1], #1
    219 	LDRB	r6, [r2], #1
    220 	LDRSH	r4, [r12],#2
    221 	LDRB	r7, [r1], #1
        	; Per-pixel pattern used below:
        	;   ADD   sum = src1+src2
        	;   ADDS  x = residue+(sum>>1), setting flags
        	;   CMPGT if x>0, compare 255 with x
        	;   EORLT taken when x<0 or x>255; x ASR #32 is -1 or 0, so the low
        	;         byte of 255^that is 0 or 255
        	;   STRB  stores only the low byte.
        	; The loads interleaved between CMPGT and EORLT do not alter flags.
    222 	ADD	r5, r5, r6
    223 	ADDS	r5, r4, r5, LSR #1
    224 	CMPGT	r8, r5
    225 	LDRB	r6, [r2], #1
    226 	LDRSH	r4, [r12],#2
    227 	EORLT	r5, r8, r5, ASR #32
    228 	STRB	r5, [r0], #1
    229 	ADD	r7, r7, r6
    230 	ADDS	r7, r4, r7, LSR #1
    231 	CMPGT	r8, r7
    232 	LDRB	r5, [r1], #1
    233 	LDRB	r6, [r2], #1
    234 	LDRSH	r4, [r12],#2
    235 	EORLT	r7, r8, r7, ASR #32
    236 	STRB	r7, [r0], #1
    237 	ADD	r5, r5, r6
    238 	ADDS	r5, r4, r5, LSR #1
    239 	CMPGT	r8, r5
    240 	LDRB	r7, [r1], #1
    241 	LDRB	r6, [r2], #1
    242 	LDRSH	r4, [r12],#2
    243 	EORLT	r5, r8, r5, ASR #32
    244 	STRB	r5, [r0], #1
    245 	ADD	r7, r7, r6
    246 	ADDS	r7, r4, r7, LSR #1
    247 	CMPGT	r8, r7
    248 	LDRB	r5, [r1], #1
    249 	LDRB	r6, [r2], #1
    250 	LDRSH	r4, [r12],#2
    251 	EORLT	r7, r8, r7, ASR #32
    252 	STRB	r7, [r0], #1
    253 	ADD	r5, r5, r6
    254 	ADDS	r5, r4, r5, LSR #1
    255 	CMPGT	r8, r5
    256 	LDRB	r7, [r1], #1
    257 	LDRB	r6, [r2], #1
    258 	LDRSH	r4, [r12],#2
    259 	EORLT	r5, r8, r5, ASR #32
    260 	STRB	r5, [r0], #1
    261 	ADD	r7, r7, r6
    262 	ADDS	r7, r4, r7, LSR #1
    263 	CMPGT	r8, r7
    264 	LDRB	r5, [r1], #1
    265 	LDRB	r6, [r2], #1
    266 	LDRSH	r4, [r12],#2
    267 	EORLT	r7, r8, r7, ASR #32
    268 	STRB	r7, [r0], #1
    269 	ADD	r5, r5, r6
    270 	ADDS	r5, r4, r5, LSR #1
    271 	CMPGT	r8, r5
        	; Last source bytes of the row: step src1 and src2 to the next row.
    272 	LDRB	r7, [r1], r3
    273 	LDRB	r6, [r2], r3
    274 	LDRSH	r4, [r12],#2
    275 	EORLT	r5, r8, r5, ASR #32
    276 	STRB	r5, [r0], #1
    277 	ADD	r7, r7, r6
    278 	ADDS	r7, r4, r7, LSR #1
    279 	CMPGT	r8, r7
    280 	EORLT	r7, r8, r7, ASR #32
        	; Last destination byte of the row: step dst to the next row.
    281 	STRB	r7, [r0], r3
    282 	SUBS	r14,r14,#1
    283 	BGT	ofrinter2_lp_arm
    284 	LDMFD	r13!,{r4-r8,PC}
    285 	ENDP
   287  [ OC_ARM_ASM_EDSP
   288 	EXPORT	oc_frag_copy_list_edsp
    290 oc_frag_copy_list_edsp PROC
        	; Copies every 8x8 fragment listed in _fragis from _src_frame to
        	; _dst_frame, one 8-byte row per LDRD/STRD (each moves a register pair).
        	; NOTE(review): LDRD/STRD presumably need 8-byte-aligned row addresses
        	; on the targeted cores - confirm against the callers' buffer layout.
    291 	; r0 = _dst_frame
    292 	; r1 = _src_frame
    293 	; r2 = _ystride
    294 	; r3 = _fragis
    295 	; <> = _nfragis
    296 	; <> = _frag_buf_offs
    297 	LDR	r12,[r13]		; r12 = _nfragis
    298 	STMFD	r13!,{r4-r11,r14}
    299 	SUBS	r12, r12, #1
    300 	LDRGE	r5, [r3],#4		; r5 = _fragis[fragii]
        	; 9 registers were pushed, so the 2nd stack argument is at offset 4*10.
    301 	LDRGE	r14,[r13,#4*10]		; r14 = _frag_buf_offs
    302 	BLT	ofcl_edsp_end
    303 ofcl_edsp_lp
    304 	MOV	r4, r1
    305 	LDR	r5, [r14,r5, LSL #2]	; r5 = _frag_buf_offs[_fragis[fragii]]
        	; Flags set here are consumed by the LDRGE/BGE at the bottom of the
        	; loop; the LDRD/STRD in between do not modify them.
    306 	SUBS	r12, r12, #1
    307 	; Stall (on XScale)
        	; Pre-indexed writeback (!) makes r4/r5 walk the source/destination
        	; rows: the first access adds the fragment offset, later ones add r2.
    308 	LDRD	r6, [r4, r5]!		; r4 = _src_frame+frag_buf_off
    309 	LDRD	r8, [r4, r2]!
    310 	; Stall
    311 	STRD	r6, [r5, r0]!		; r5 = _dst_frame+frag_buf_off
    312 	STRD	r8, [r5, r2]!
    313 	; Stall
    314 	LDRD	r6, [r4, r2]!	; On Xscale at least, doing 3 consecutive
    315 	LDRD	r8, [r4, r2]!	; loads causes a stall, but that's no worse
    316 	LDRD	r10,[r4, r2]!	; than us only doing 2, and having to do
    317 				; another pair of LDRD/STRD later on.
    318 	; Stall
    319 	STRD	r6, [r5, r2]!
    320 	STRD	r8, [r5, r2]!
    321 	STRD	r10,[r5, r2]!
    322 	LDRD	r6, [r4, r2]!
    323 	LDRD	r8, [r4, r2]!
    324 	LDRD	r10,[r4, r2]!
    325 	STRD	r6, [r5, r2]!
    326 	STRD	r8, [r5, r2]!
    327 	STRD	r10,[r5, r2]!
        	; Fetch the next fragment index only if another fragment remains.
    328 	LDRGE	r5, [r3],#4		; r5 = _fragis[fragii]
    329 	BGE	ofcl_edsp_lp
    330 ofcl_edsp_end
    331 	LDMFD	r13!,{r4-r11,PC}
    332 	ENDP
   333  ]
   335  [ OC_ARM_ASM_MEDIA
   336 	EXPORT	oc_frag_recon_intra_v6
   337 	EXPORT	oc_frag_recon_inter_v6
   338 	EXPORT	oc_frag_recon_inter2_v6
    340 oc_frag_recon_intra_v6 PROC
        	; Writes an 8x8 block to _dst where each pixel is _residue[i]+128
        	; saturated to 0..255, processing two 16-bit residues per register.
        	; One loop iteration produces one 8-byte row.
    341 	; r0 =       unsigned char *_dst
    342 	; r1 =       int            _ystride
    343 	; r2 = const ogg_int16_t    _residue[64]
    344 	STMFD	r13!,{r4-r6,r14}
    345 	MOV	r14,#8			; r14 = rows remaining
        	; The residue pointer moves to r12 because r2/r3 are needed as the
        	; LDRD/STRD register pair.
    346 	MOV	r12,r2
    347 	LDR	r6, =0x00800080		; r6 = 128 in each 16-bit lane
    348 ofrintra_v6_lp
    349 	LDRD	r2, [r12],#8	; r2 = 11110000 r3 = 33332222
    350 	LDRD	r4, [r12],#8	; r4 = 55554444 r5 = 77776666
        	; Flags set here are consumed by the BGT at the bottom; none of the
        	; instructions in between write the N/Z/C/V flags.
    351 	SUBS	r14,r14,#1
    352 	QADD16	r2, r2, r6
    353 	QADD16	r3, r3, r6
    354 	QADD16	r4, r4, r6
    355 	QADD16	r5, r5, r6
    356 	USAT16	r2, #8, r2		; r2 = __11__00
    357 	USAT16	r3, #8, r3		; r3 = __33__22
    358 	USAT16	r4, #8, r4		; r4 = __55__44
    359 	USAT16	r5, #8, r5		; r5 = __77__66
        	; Fold the upper lane's byte down next to the lower lane's byte so
        	; the low halfword holds both pixels.
    360 	ORR	r2, r2, r2, LSR #8	; r2 = __111100
    361 	ORR	r3, r3, r3, LSR #8	; r3 = __333322
    362 	ORR	r4, r4, r4, LSR #8	; r4 = __555544
    363 	ORR	r5, r5, r5, LSR #8	; r5 = __777766
    364 	PKHBT   r2, r2, r3, LSL #16     ; r2 = 33221100
    365 	PKHBT   r3, r4, r5, LSL #16     ; r3 = 77665544
        	; Store the r2/r3 pair as one row and step _dst by _ystride.
    366 	STRD	r2, [r0], r1
    367 	BGT	ofrintra_v6_lp
    368 	LDMFD	r13!,{r4-r6,PC}
    369 	ENDP
    371 oc_frag_recon_inter_v6 PROC
        	; Writes an 8x8 block to _dst where each pixel is _src[i]+_residue[i]
        	; saturated to 0..255. Residues are regrouped into even/odd 16-bit
        	; lanes so they can be added to the byte-unpacked source pixels.
        	; One loop iteration produces one 8-byte row.
    372 	; r0 =       unsigned char *_dst
    373 	; r1 = const unsigned char *_src
    374 	; r2 =       int            _ystride
    375 	; r3 = const ogg_int16_t    _residue[64]
    376 	STMFD	r13!,{r4-r7,r14}
    377 	MOV	r14,#8			; r14 = rows remaining
    378 ofrinter_v6_lp
    379 	LDRD	r6, [r3], #8		; r6 = 11110000 r7 = 33332222
        	; Flags set here are consumed by the BGT at the bottom; none of the
        	; instructions in between write the N/Z/C/V flags.
    380 	SUBS	r14,r14,#1
        	; Fetch the 8 source pixels of this row into r4/r5, with one LDRD
        	; when unaligned LDRD is available, otherwise with two LDRs.
    381  [ OC_ARM_CAN_UNALIGN_LDRD
    382 	LDRD	r4, [r1], r2	; Unaligned ; r4 = 33221100 r5 = 77665544
    383  |
    384 	LDR	r5, [r1, #4]
    385 	LDR	r4, [r1], r2
    386  ]
    387 	PKHBT	r12,r6, r7, LSL #16	; r12= 22220000
    388 	PKHTB	r7, r7, r6, ASR #16	; r7 = 33331111
    389 	UXTB16	r6,r4			; r6 = __22__00
    390 	UXTB16	r4,r4, ROR #8		; r4 = __33__11
    391 	QADD16	r12,r12,r6		; r12= xx22xx00
    392 	QADD16	r4, r7, r4		; r4 = xx33xx11
    393 	LDRD	r6, [r3], #8		; r6 = 55554444 r7 = 77776666
    394 	USAT16	r4, #8, r4		; r4 = __33__11
    395 	USAT16	r12,#8,r12		; r12= __22__00
    396 	ORR	r4, r12,r4, LSL #8	; r4 = 33221100
    397 	PKHBT	r12,r6, r7, LSL #16	; r12= 66664444
    398 	PKHTB	r7, r7, r6, ASR #16	; r7 = 77775555
    399 	UXTB16	r6,r5			; r6 = __66__44
    400 	UXTB16	r5,r5, ROR #8		; r5 = __77__55
    401 	QADD16	r12,r12,r6		; r12= xx66xx44
    402 	QADD16	r5, r7, r5		; r5 = xx77xx55
    403 	USAT16	r12,#8, r12		; r12= __66__44
    404 	USAT16	r5, #8, r5		; r5 = __77__55
    405 	ORR	r5, r12,r5, LSL #8	; r5 = 77665544
        	; Store the r4/r5 pair as one row and step _dst by _ystride.
    406 	STRD	r4, [r0], r2
    407 	BGT	ofrinter_v6_lp
    408 	LDMFD	r13!,{r4-r7,PC}
    409 	ENDP
    411 oc_frag_recon_inter2_v6 PROC
        	; Writes an 8x8 block to _dst where each pixel is the halved sum of
        	; _src1[i] and _src2[i] (UHADD8) plus _residue[i], saturated to 0..255.
        	; Each iteration handles pixels 4..7 of a row first, then pixels 0..3,
        	; and stores the whole 8-byte row at once.
    412 	; r0 =       unsigned char *_dst
    413 	; r1 = const unsigned char *_src1
    414 	; r2 = const unsigned char *_src2
    415 	; r3 =       int            _ystride
        	; The residue pointer is the first stack argument; it is read before
        	; the register push moves r13.
    416 	LDR	r12,[r13]
    417 	; r12= const ogg_int16_t    _residue[64]
    418 	STMFD	r13!,{r4-r9,r14}
    419 	MOV	r14,#8			; r14 = rows remaining
    420 ofrinter2_v6_lp
    421 	LDRD	r6, [r12,#8]	; r6 = 55554444 r7 = 77776666
        	; Flags set here are consumed by the BGT at the bottom; none of the
        	; instructions in between write the N/Z/C/V flags.
    422 	SUBS	r14,r14,#1
    423 	LDR	r4, [r1, #4]	; Unaligned	; r4 = src1[1] = 77665544
    424 	LDR	r5, [r2, #4]	; Unaligned	; r5 = src2[1] = 77665544
    425 	PKHBT	r8, r6, r7, LSL #16	; r8 = 66664444
    426 	PKHTB	r9, r7, r6, ASR #16	; r9 = 77775555
    427 	UHADD8	r4, r4, r5	; r4 = (src1[7,6,5,4] + src2[7,6,5,4])>>1
    428 	UXTB16	r5, r4			; r5 = __66__44
    429 	UXTB16	r4, r4, ROR #8		; r4 = __77__55
    430 	QADD16	r8, r8, r5		; r8 = xx66xx44
    431 	QADD16	r9, r9, r4		; r9 = xx77xx55
        	; Load the first four residues of the row and advance r12 past the
        	; whole row (16 bytes).
    432 	LDRD	r6,[r12],#16	; r6 = 11110000 r7 = 33332222
    433 	USAT16	r8, #8, r8		; r8 = __66__44
    434 	LDR	r4, [r1], r3	; Unaligned	; r4 = src1[0] = 33221100
    435 	USAT16	r9, #8, r9		; r9 = __77__55
    436 	LDR	r5, [r2], r3	; Unaligned	; r5 = src2[0] = 33221100
    437 	ORR	r9, r8, r9, LSL #8	; r9 = 77665544
    438 	PKHBT	r8, r6, r7, LSL #16	; r8 = 22220000
    439 	UHADD8	r4, r4, r5	; r4 = (src1[3,2,1,0] + src2[3,2,1,0])>>1
    440 	PKHTB	r7, r7, r6, ASR #16	; r7 = 33331111
    441 	UXTB16	r5, r4			; r5 = __22__00
    442 	UXTB16	r4, r4, ROR #8		; r4 = __33__11
    443 	QADD16	r8, r8, r5		; r8 = xx22xx00
    444 	QADD16	r7, r7, r4		; r7 = xx33xx11
    445 	USAT16	r8, #8, r8		; r8 = __22__00
    446 	USAT16	r7, #8, r7		; r7 = __33__11
    447 	ORR	r8, r8, r7, LSL #8	; r8 = 33221100
        	; Store the r8/r9 pair as one row and step _dst by _ystride.
    448 	STRD	r8, [r0], r3
    449 	BGT	ofrinter2_v6_lp
    450 	LDMFD	r13!,{r4-r9,PC}
    451 	ENDP
   452  ]
   454  [ OC_ARM_ASM_NEON
   455 	EXPORT	oc_frag_copy_list_neon
   456 	EXPORT	oc_frag_recon_intra_neon
   457 	EXPORT	oc_frag_recon_inter_neon
   458 	EXPORT	oc_frag_recon_inter2_neon
    460 oc_frag_copy_list_neon PROC
        	; Copies every 8x8 fragment listed in _fragis from _src_frame to
        	; _dst_frame, one 8-byte row per D register, while issuing PLDs for
        	; the rows of the next fragment.
        	; The @64 qualifiers assert 64-bit-aligned row addresses.
    461 	; r0 = _dst_frame
    462 	; r1 = _src_frame
    463 	; r2 = _ystride
    464 	; r3 = _fragis
    465 	; <> = _nfragis
    466 	; <> = _frag_buf_offs
    467 	LDR	r12,[r13]		; r12 = _nfragis
    468 	STMFD	r13!,{r4-r7,r14}
    469 	CMP	r12, #1
    470 	LDRGE	r6, [r3]		; r6 = _fragis[fragii]
        	; 5 registers were pushed, so the 2nd stack argument is at offset 4*6.
    471 	LDRGE	r14,[r13,#4*6]		; r14 = _frag_buf_offs
    472 	BLT	ofcl_neon_end
    473 	; Stall (2 on Xscale)
    474 	LDR	r6, [r14,r6, LSL #2]	; r6 = _frag_buf_offs[_fragis[fragii]]
    475 	; Stall (on XScale)
        	; NOTE(review): r7 receives the bare fragment offset here, not
        	; _src_frame+offset; it only becomes a source address once ADDGT runs.
        	; Presumably harmless because PLD is only a hint - confirm.
    476 	MOV	r7, r6			; Guarantee PLD points somewhere valid.
    477 ofcl_neon_lp
    478 	ADD	r4, r1, r6		; r4 = source address of this fragment
    479 	VLD1.64	{D0}, [r4@64], r2
    480 	ADD	r5, r0, r6		; r5 = destination address of this fragment
    481 	VLD1.64	{D1}, [r4@64], r2
        	; Flags set here drive every GT-conditional instruction through to
        	; the BGT; nothing in between modifies them.
    482 	SUBS	r12, r12, #1
    483 	VLD1.64	{D2}, [r4@64], r2
    484 	LDRGT	r6, [r3,#4]!		; r6 = _fragis[fragii]
    485 	VLD1.64	{D3}, [r4@64], r2
    486 	LDRGT	r6, [r14,r6, LSL #2]	; r6 = _frag_buf_offs[_fragis[fragii]]
    487 	VLD1.64	{D4}, [r4@64], r2
    488 	ADDGT	r7, r1, r6		; r7 = source address of the next fragment
    489 	VLD1.64	{D5}, [r4@64], r2
        	; When another fragment remains, r7 is advanced 4 rows at a time so
        	; the PLDs below touch its rows 0-2, 3-6 and 7 in turn.
    490 	PLD	[r7]
    491 	VLD1.64	{D6}, [r4@64], r2
    492 	PLD	[r7, r2]
    493 	VLD1.64	{D7}, [r4@64]
    494 	PLD	[r7, r2, LSL #1]
    495 	VST1.64	{D0}, [r5@64], r2
    496 	ADDGT	r7, r7, r2, LSL #2
    497 	VST1.64	{D1}, [r5@64], r2
    498 	PLD	[r7, -r2]
    499 	VST1.64	{D2}, [r5@64], r2
    500 	PLD	[r7]
    501 	VST1.64	{D3}, [r5@64], r2
    502 	PLD	[r7, r2]
    503 	VST1.64	{D4}, [r5@64], r2
    504 	PLD	[r7, r2, LSL #1]
    505 	VST1.64	{D5}, [r5@64], r2
    506 	ADDGT	r7, r7, r2, LSL #2
    507 	VST1.64	{D6}, [r5@64], r2
    508 	PLD	[r7, -r2]
    509 	VST1.64	{D7}, [r5@64]
    510 	BGT	ofcl_neon_lp
    511 ofcl_neon_end
    512 	LDMFD	r13!,{r4-r7,PC}
    513 	ENDP
    515 oc_frag_recon_intra_neon PROC
        	; Writes an 8x8 block to _dst where each pixel is _residue[i]+128
        	; saturated to 0..255. All 64 residues are loaded at once into D16-D31;
        	; the routine is straight-line code, uses no stack and returns via r14.
    516 	; r0 =       unsigned char *_dst
    517 	; r1 =       int            _ystride
    518 	; r2 = const ogg_int16_t    _residue[64]
    519 	MOV	r3, #128
    520 	VDUP.S16	Q0, r3		; Q0 = 128 in each 16-bit lane
    521 	VLDMIA	r2,  {D16-D31}	; D16= 3333222211110000 etc	; 9(8) cycles
        	; Saturating signed add of the bias to each row (Q8..Q15 = rows 0..7).
    522 	VQADD.S16	Q8, Q8, Q0
    523 	VQADD.S16	Q9, Q9, Q0
    524 	VQADD.S16	Q10,Q10,Q0
    525 	VQADD.S16	Q11,Q11,Q0
    526 	VQADD.S16	Q12,Q12,Q0
    527 	VQADD.S16	Q13,Q13,Q0
    528 	VQADD.S16	Q14,Q14,Q0
    529 	VQADD.S16	Q15,Q15,Q0
        	; Narrow each row to 8 unsigned bytes with saturation and store it,
        	; stepping _dst by _ystride; narrowing and stores are interleaved.
    530 	VQMOVUN.S16	D16,Q8	; D16= 7766554433221100		; 1 cycle
    531 	VQMOVUN.S16	D17,Q9	; D17= FFEEDDCCBBAA9988		; 1 cycle
    532 	VQMOVUN.S16	D18,Q10	; D18= NNMMLLKKJJIIHHGG		; 1 cycle
    533 	VST1.64	{D16},[r0@64], r1
    534 	VQMOVUN.S16	D19,Q11	; D19= VVUUTTSSRRQQPPOO		; 1 cycle
    535 	VST1.64	{D17},[r0@64], r1
    536 	VQMOVUN.S16	D20,Q12	; D20= ddccbbaaZZYYXXWW		; 1 cycle
    537 	VST1.64	{D18},[r0@64], r1
    538 	VQMOVUN.S16	D21,Q13	; D21= llkkjjiihhggffee		; 1 cycle
    539 	VST1.64	{D19},[r0@64], r1
    540 	VQMOVUN.S16	D22,Q14	; D22= ttssrrqqppoonnmm		; 1 cycle
    541 	VST1.64	{D20},[r0@64], r1
    542 	VQMOVUN.S16	D23,Q15	; D23= !!@@zzyyxxwwvvuu		; 1 cycle
    543 	VST1.64	{D21},[r0@64], r1
    544 	VST1.64	{D22},[r0@64], r1
    545 	VST1.64	{D23},[r0@64], r1
    546 	MOV	PC,R14
    547 	ENDP
    549 oc_frag_recon_inter_neon PROC
        	; Writes an 8x8 block to _dst where each pixel is _src[i]+_residue[i]
        	; saturated to 0..255. Source rows are loaded four at a time, widened
        	; to 16 bits and added to the residues held in Q8-Q15.
        	; Straight-line code, uses no stack and returns via r14.
    550 	; r0 =       unsigned char *_dst
    551 	; r1 = const unsigned char *_src
    552 	; r2 =       int            _ystride
    553 	; r3 = const ogg_int16_t    _residue[64]
    554 	VLDMIA	r3, {D16-D31}	; D16= 3333222211110000 etc	; 9(8) cycles
        	; Source loads carry no alignment qualifier; only the stores to _dst
        	; assert @64 alignment.
    555 	VLD1.64	{D0}, [r1], r2
    556 	VLD1.64	{D2}, [r1], r2
    557 	VMOVL.U8	Q0, D0	; Q0 = __77__66__55__44__33__22__11__00
    558 	VLD1.64	{D4}, [r1], r2
    559 	VMOVL.U8	Q1, D2	; etc
    560 	VLD1.64	{D6}, [r1], r2
    561 	VMOVL.U8	Q2, D4
    562 	VMOVL.U8	Q3, D6
        	; Rows 0-3: add widened source to residues while loading rows 4-7.
    563 	VQADD.S16	Q8, Q8, Q0
    564 	VLD1.64	{D0}, [r1], r2
    565 	VQADD.S16	Q9, Q9, Q1
    566 	VLD1.64	{D2}, [r1], r2
    567 	VQADD.S16	Q10,Q10,Q2
    568 	VLD1.64	{D4}, [r1], r2
    569 	VQADD.S16	Q11,Q11,Q3
    570 	VLD1.64	{D6}, [r1], r2
    571 	VMOVL.U8	Q0, D0
    572 	VMOVL.U8	Q1, D2
    573 	VMOVL.U8	Q2, D4
    574 	VMOVL.U8	Q3, D6
        	; Rows 4-7.
    575 	VQADD.S16	Q12,Q12,Q0
    576 	VQADD.S16	Q13,Q13,Q1
    577 	VQADD.S16	Q14,Q14,Q2
    578 	VQADD.S16	Q15,Q15,Q3
        	; Narrow each row to 8 unsigned bytes with saturation and store it,
        	; stepping _dst by _ystride; narrowing and stores are interleaved.
    579 	VQMOVUN.S16	D16,Q8
    580 	VQMOVUN.S16	D17,Q9
    581 	VQMOVUN.S16	D18,Q10
    582 	VST1.64	{D16},[r0@64], r2
    583 	VQMOVUN.S16	D19,Q11
    584 	VST1.64	{D17},[r0@64], r2
    585 	VQMOVUN.S16	D20,Q12
    586 	VST1.64	{D18},[r0@64], r2
    587 	VQMOVUN.S16	D21,Q13
    588 	VST1.64	{D19},[r0@64], r2
    589 	VQMOVUN.S16	D22,Q14
    590 	VST1.64	{D20},[r0@64], r2
    591 	VQMOVUN.S16	D23,Q15
    592 	VST1.64	{D21},[r0@64], r2
    593 	VST1.64	{D22},[r0@64], r2
    594 	VST1.64	{D23},[r0@64], r2
    595 	MOV	PC,R14
    596 	ENDP
    598 oc_frag_recon_inter2_neon PROC
        	; Writes an 8x8 block to _dst where each pixel is the halved sum of
        	; _src1[i] and _src2[i] (VHADD.U8) plus _residue[i], saturated to
        	; 0..255. Rows are averaged two at a time (one Q register per pair),
        	; widened to 16 bits and added to the residues held in Q8-Q15.
        	; Straight-line code, uses no stack and returns via r14.
    599 	; r0 =       unsigned char *_dst
    600 	; r1 = const unsigned char *_src1
    601 	; r2 = const unsigned char *_src2
    602 	; r3 =       int            _ystride
        	; The residue pointer is the first stack argument; nothing is pushed,
        	; so it is still at [r13].
    603 	LDR	r12,[r13]
    604 	; r12= const ogg_int16_t    _residue[64]
    605 	VLDMIA	r12,{D16-D31}
        	; Rows 0-1 of src1 go to Q0 (D0,D1), rows 0-1 of src2 to Q2 (D4,D5).
    606 	VLD1.64	{D0}, [r1], r3
    607 	VLD1.64	{D4}, [r2], r3
    608 	VLD1.64	{D1}, [r1], r3
    609 	VLD1.64	{D5}, [r2], r3
    610 	VHADD.U8	Q2, Q0, Q2	; Q2 = FFEEDDCCBBAA99887766554433221100
        	; Rows 2-3 of src1 go to Q1 (D2,D3), rows 2-3 of src2 to Q3 (D6,D7).
    611 	VLD1.64	{D2}, [r1], r3
    612 	VLD1.64	{D6}, [r2], r3
    613 	VMOVL.U8	Q0, D4		; Q0 = __77__66__55__44__33__22__11__00
    614 	VLD1.64	{D3}, [r1], r3
    615 	VMOVL.U8	Q2, D5		; etc
    616 	VLD1.64	{D7}, [r2], r3
    617 	VHADD.U8	Q3, Q1, Q3
    618 	VQADD.S16	Q8, Q8, Q0
    619 	VQADD.S16	Q9, Q9, Q2
        	; Rows 4-7 are loaded and averaged the same way while rows 2-3 are
        	; being widened and added.
    620 	VLD1.64	{D0}, [r1], r3
    621 	VMOVL.U8	Q1, D6
    622 	VLD1.64	{D4}, [r2], r3
    623 	VMOVL.U8	Q3, D7
    624 	VLD1.64	{D1}, [r1], r3
    625 	VQADD.S16	Q10,Q10,Q1
    626 	VLD1.64	{D5}, [r2], r3
    627 	VQADD.S16	Q11,Q11,Q3
    628 	VLD1.64	{D2}, [r1], r3
    629 	VHADD.U8	Q2, Q0, Q2
    630 	VLD1.64	{D6}, [r2], r3
    631 	VLD1.64	{D3}, [r1], r3
    632 	VMOVL.U8	Q0, D4
    633 	VLD1.64	{D7}, [r2], r3
    634 	VMOVL.U8	Q2, D5
    635 	VHADD.U8	Q3, Q1, Q3
    636 	VQADD.S16	Q12,Q12,Q0
    637 	VQADD.S16	Q13,Q13,Q2
    638 	VMOVL.U8	Q1, D6
    639 	VMOVL.U8	Q3, D7
    640 	VQADD.S16	Q14,Q14,Q1
    641 	VQADD.S16	Q15,Q15,Q3
        	; Narrow each row to 8 unsigned bytes with saturation and store it,
        	; stepping _dst by _ystride; narrowing and stores are interleaved.
    642 	VQMOVUN.S16	D16,Q8
    643 	VQMOVUN.S16	D17,Q9
    644 	VQMOVUN.S16	D18,Q10
    645 	VST1.64	{D16},[r0@64], r3
    646 	VQMOVUN.S16	D19,Q11
    647 	VST1.64	{D17},[r0@64], r3
    648 	VQMOVUN.S16	D20,Q12
    649 	VST1.64	{D18},[r0@64], r3
    650 	VQMOVUN.S16	D21,Q13
    651 	VST1.64	{D19},[r0@64], r3
    652 	VQMOVUN.S16	D22,Q14
    653 	VST1.64	{D20},[r0@64], r3
    654 	VQMOVUN.S16	D23,Q15
    655 	VST1.64	{D21},[r0@64], r3
    656 	VST1.64	{D22},[r0@64], r3
    657 	VST1.64	{D23},[r0@64], r3
    658 	MOV	PC,R14
    659 	ENDP
   660  ]
   662 	END

mercurial