The Tor Browser: gfx/ycbcr/yuv_row

Conditionally force memory storage according to privacy.thirdparty.isolate.
This solves Tor bug #9701, complying with disk avoidance documented in
https://www.torproject.org/projects/torbrowser/design/#disk-avoidance.

     1 /* This Source Code Form is subject to the terms of the Mozilla Public

     2  * License, v. 2.0. If a copy of the MPL was not distributed with this

     3  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

     5     .arch   armv7-a

     6     .fpu    neon

     7 /* Allow building on targets that do not support NEON, and force the object

     8  * file's target so that this file does not bump the final binary's target. */

     9     .object_arch armv4t

    10     .text

    11     .align

    13     .balign 64                  @ The constant tables that follow are loaded with a :128 alignment hint.

    @ Per-dither-mode constant tables: four tables of 16 bytes each, laid out
    @  back to back so that the row function can select one with
    @  YCbCr42xToRGB565_DITHER03_CONSTS_NEON + 16*dither.
    @ Each table holds three pairs of 16-bit biases (R, G, B in that order).
    @ The two values of a pair differ only in the dither offset added to the
    @  base bias; the DITHERxy suffix gives those two offsets in units of 128
    @  for R and B and in units of 64 for G.
    @ The row function broadcasts each pair with VDUP.32, so the two values
    @  alternate across the 16-bit lanes.
    @ The four trailing bytes (102, 25, 52, 129) equal the chroma multipliers
    @  that the row function materializes with VMOV.I8.
    14 YCbCr42xToRGB565_DITHER03_CONSTS_NEON:

    15     .short -14240               @ bias_R, dither offset 0

    16     .short -14240+384           @ bias_R, dither offset 3*128

    17     .short   8672               @ bias_G, dither offset 0

    18     .short   8672+192           @ bias_G, dither offset 3*64

    19     .short -17696               @ bias_B, dither offset 0

    20     .short -17696+384           @ bias_B, dither offset 3*128

    21     .byte 102                   @ Cr multiplier for R

    22     .byte  25                   @ Cb multiplier for G

    23     .byte  52                   @ Cr multiplier for G

    24     .byte 129                   @ Cb multiplier for B

    25 YCbCr42xToRGB565_DITHER12_CONSTS_NEON:   @ Same layout, offsets 1 and 2.

    26     .short -14240+128

    27     .short -14240+256

    28     .short   8672+64

    29     .short   8672+128

    30     .short -17696+128

    31     .short -17696+256

    32     .byte 102

    33     .byte  25

    34     .byte  52

    35     .byte 129

    36 YCbCr42xToRGB565_DITHER21_CONSTS_NEON:   @ Same layout, offsets 2 and 1.

    37     .short -14240+256

    38     .short -14240+128

    39     .short   8672+128

    40     .short   8672+64

    41     .short -17696+256

    42     .short -17696+128

    43     .byte 102

    44     .byte  25

    45     .byte  52

    46     .byte 129

    47 YCbCr42xToRGB565_DITHER30_CONSTS_NEON:   @ Same layout, offsets 3 and 0.

    48     .short -14240+384

    49     .short -14240

    50     .short   8672+192

    51     .short   8672

    52     .short -17696+384

    53     .short -17696

    54     .byte 102

    55     .byte  25

    56     .byte  52

    57     .byte 129

    59 @ void ScaleYCbCr42xToRGB565_BilinearY_Row_NEON(

    60 @  yuv2rgb565_row_scale_bilinear_ctx *ctx, int dither);

    61 @

       @ Converts one row of Y'CbCr to RGB565 while scaling horizontally,
       @  producing 16 output pixels per loop iteration.

       @ Y' is sampled bilinearly: two source rows (y_row and y_row+y_pitch) are
       @  blended with y_yweight, then the samples at source_x and source_x+1
       @  are blended with the fractional part of the 16.16 X coordinate.

       @ Cb and Cr are fetched at a single position derived from
       @  (source_x+source_uv_xoffs_q16)>>17 (rounded), without blending.

       @ On entry r0 = ctx and r1 = dither; dither selects one of the 16-byte
       @  constant tables starting at YCbCr42xToRGB565_DITHER03_CONSTS_NEON.

       @ r4-r9, r14 and Q4-Q7 are saved and restored; r0-r3, r12, Q0-Q3 and
       @  Q8-Q15 are used as scratch.

       @

    62 @ ctx = {

    63 @   uint16_t *rgb_row;       /*r0*/

    64 @   const uint8_t *y_row;    /*r1*/

    65 @   const uint8_t *u_row;    /*r2*/

    66 @   const uint8_t *v_row;    /*r3*/

    67 @   int y_yweight;           /*r4*/

    68 @   int y_pitch;             /*r5*/

    69 @   int width;               /*r6*/

    70 @   int source_x0_q16;       /*r7*/

    71 @   int source_dx_q16;       /*r8*/

    72 @   int source_uv_xoffs_q16; /*r9*/

    73 @ };

    74     .global ScaleYCbCr42xToRGB565_BilinearY_Row_NEON

    75     .type   ScaleYCbCr42xToRGB565_BilinearY_Row_NEON, %function

    76     .balign 64

    77     .fnstart

    78 ScaleYCbCr42xToRGB565_BilinearY_Row_NEON:

    79     STMFD       r13!,{r4-r9,r14}       @ 7 words.

    80     ADR         r14,YCbCr42xToRGB565_DITHER03_CONSTS_NEON

    81     VPUSH       {Q4-Q7}                @ 16 words.

    82     ADD         r14,r14,r1, LSL #4     @ Select the dither table to use (16 bytes each)

    83     LDMIA       r0, {r0-r9}            @ Load every ctx field; r0 and r1 are overwritten

    84     @ Set up image index registers.

    85     ADD         r12,r8, r8             @ r12 = 2*source_dx_q16

    86     VMOV.I32    D16,#0         @ Q8 = < 2| 2| 0| 0>*source_dx_q16

    87     VDUP.32     D17,r12

    88     ADD         r12,r12,r12            @ r12 = 4*source_dx_q16

    89     VTRN.32     D16,D17        @ Q8 = < 2| 0| 2| 0>*source_dx_q16

    90     VDUP.32     D19,r12        @ Q9 = < 4| 4| ?| ?>*source_dx_q16

    91     ADD         r12,r12,r12            @ r12 = 8*source_dx_q16

    92     VDUP.32     Q0, r7         @ Q0 = < 1| 1| 1| 1>*source_x0_q16

    93     VADD.I32    D17,D17,D19    @ Q8 = < 6| 4| 2| 0>*source_dx_q16

    94     CMP         r8, #0                 @ If source_dx_q16 is negative...

    95     VDUP.32     Q9, r12        @ Q9 = < 8| 8| 8| 8>*source_dx_q16

    96     ADDLT       r7, r7, r8, LSL #4     @ Make r7 point to the end of the block

    97     VADD.I32    Q0, Q0, Q8     @ Q0 = < 6| 4| 2| 0>*source_dx_q16+source_x0_q16

    98     SUBLT       r7, r7, r8             @ (i.e., the lowest address we'll use)

    99     VADD.I32    Q1, Q0, Q9     @ Q1 = <14|12|10| 8>*source_dx_q16+source_x0_q16

   100     VDUP.I32    Q9, r8         @ Q9 = < 1| 1| 1| 1>*source_dx_q16

   101     VADD.I32    Q2, Q0, Q9     @ Q2 = < 7| 5| 3| 1>*source_dx_q16+source_x0_q16

   102     VADD.I32    Q3, Q1, Q9     @ Q3 = <15|13|11| 9>*source_dx_q16+source_x0_q16

   103     VLD1.64     {D30,D31},[r14,:128]   @ Load some constants

   104     VMOV.I8     D28,#52                @ D28 = Cr multiplier for G

   105     VMOV.I8     D29,#129               @ D29 = Cb multiplier for B

   106     @ The basic idea here is to do aligned loads of a block of data and then

   107     @  index into it using VTBL to extract the data from the source X

   108     @  coordinate corresponding to each destination pixel.

   109     @ This is significantly less code and significantly fewer cycles than doing

   110     @  a series of single-lane loads, but it means that the X step between

   111     @  pixels must be limited to 2.0 or less, otherwise we couldn't guarantee

   112     @  that we could read 8 pixels from a single aligned 32-byte block of data.

   113     @ Q0...Q3 contain the 16.16 fixed-point X coordinates of each pixel,

   114     @  separated into even pixels and odd pixels to make extracting offsets and

   115     @  weights easier.

   116     @ We then pull out two bytes from the middle of each coordinate: the top

   117     @  byte corresponds to the integer part of the X coordinate, and the bottom

   118     @  byte corresponds to the weight to use for bilinear blending.

   119     @ These are separated out into different registers with VTRN.

   120     @ Then by subtracting the integer X coordinate of the first pixel in the

   121     @  data block we loaded, we produce an index register suitable for use by

   122     @  VTBL.

   123 s42xbily_neon_loop:

   124     @ Load the Y' data.

   125     MOV         r12,r7, ASR #16

   126     VRSHRN.S32  D16,Q0, #8

   127     AND         r12,r12,#~15   @ Read 16-byte aligned blocks

   128     VDUP.I8     D20,r12

   129     ADD         r12,r1, r12    @ r12 = y_row+(source_x&~15)

   130     VRSHRN.S32  D17,Q1, #8

   131     PLD         [r12,#64]

   132     VLD1.64     {D8, D9, D10,D11},[r12,:128],r5        @ Load Y' top row

   133     ADD         r14,r7, r8, LSL #3

   134     VRSHRN.S32  D18,Q2, #8

   135     MOV         r14,r14,ASR #16

   136     VRSHRN.S32  D19,Q3, #8

   137     AND         r14,r14,#~15   @ Read 16-byte aligned blocks

   138     VLD1.64     {D12,D13,D14,D15},[r12,:128]           @ Load Y' bottom row

   139     PLD         [r12,#64]

   140     VDUP.I8     D21,r14

   141     ADD         r14,r1, r14    @ r14 = y_row+(source_x&~15), for the next 8 pixels

   142     VMOV.I8     Q13,#1

   143     PLD         [r14,#64]

   144     VTRN.8      Q8, Q9         @ Q8  = <wFwEwDwCwBwAw9w8w7w6w5w4w3w2w1w0>

   145                                @ Q9  = <xFxExDxCxBxAx9x8x7x6x5x4x3x2x1x0>

   146     VSUB.S8     Q9, Q9, Q10    @ Make offsets relative to the data we loaded.

   147     @ First 8 Y' pixels

   148     VTBL.8      D20,{D8, D9, D10,D11},D18      @ Index top row at source_x

   149     VTBL.8      D24,{D12,D13,D14,D15},D18      @ Index bottom row at source_x

   150     VADD.S8     Q13,Q9, Q13                    @ Add 1 to source_x

   151     VTBL.8      D22,{D8, D9, D10,D11},D26      @ Index top row at source_x+1

   152     VTBL.8      D26,{D12,D13,D14,D15},D26      @ Index bottom row at source_x+1

   153     @ Next 8 Y' pixels

   154     VLD1.64     {D8, D9, D10,D11},[r14,:128],r5        @ Load Y' top row

   155     VLD1.64     {D12,D13,D14,D15},[r14,:128]           @ Load Y' bottom row

   156     PLD         [r14,#64]

   157     VTBL.8      D21,{D8, D9, D10,D11},D19      @ Index top row at source_x

   158     VTBL.8      D25,{D12,D13,D14,D15},D19      @ Index bottom row at source_x

   159     VTBL.8      D23,{D8, D9, D10,D11},D27      @ Index top row at source_x+1

   160     VTBL.8      D27,{D12,D13,D14,D15},D27      @ Index bottom row at source_x+1

   161     @ Blend Y'.

   162     VDUP.I16    Q9, r4         @ Load the y weights.

   163     VSUBL.U8    Q4, D24,D20    @ Q5:Q4 = c-a

   164     VSUBL.U8    Q5, D25,D21

   165     VSUBL.U8    Q6, D26,D22    @ Q7:Q6 = d-b

   166     VSUBL.U8    Q7, D27,D23

   167     VMUL.S16    Q4, Q4, Q9     @ Q5:Q4 = (c-a)*yweight

   168     VMUL.S16    Q5, Q5, Q9

   169     VMUL.S16    Q6, Q6, Q9     @ Q7:Q6 = (d-b)*yweight

   170     VMUL.S16    Q7, Q7, Q9

   171     VMOVL.U8    Q12,D16        @ Promote the x weights to 16 bits.

   172     VMOVL.U8    Q13,D17        @ Sadly, there's no VMULW.

   173     VRSHRN.S16  D8, Q4, #8     @ Q4 = (c-a)*yweight+128>>8

   174     VRSHRN.S16  D9, Q5, #8

   175     VRSHRN.S16  D12,Q6, #8     @ Q6 = (d-b)*yweight+128>>8

   176     VRSHRN.S16  D13,Q7, #8

   177     VADD.I8     Q10,Q10,Q4     @ Q10 = a+((c-a)*yweight+128>>8)

   178     VADD.I8     Q11,Q11,Q6     @ Q11 = b+((d-b)*yweight+128>>8)

   179     VSUBL.U8    Q4, D22,D20    @ Q5:Q4 = b-a (a and b are now the vertically blended values)

   180     VSUBL.U8    Q5, D23,D21

   181     VMUL.S16    Q4, Q4, Q12    @ Q5:Q4 = (b-a)*xweight

   182     VMUL.S16    Q5, Q5, Q13

   183     VRSHRN.S16  D8, Q4, #8     @ Q4 = (b-a)*xweight+128>>8

   184     ADD         r12,r7, r9

   185     VRSHRN.S16  D9, Q5, #8

   186     MOV         r12,r12,ASR #17

   187     VADD.I8     Q8, Q10,Q4     @ Q8 = a+((b-a)*xweight+128>>8)

   188     @ Start extracting the chroma x coordinates, and load Cb and Cr.

   189     AND         r12,r12,#~15   @ Read 16-byte aligned blocks

   190     VDUP.I32    Q9, r9         @ Q9 = source_uv_xoffs_q16 x 4

   191     ADD         r14,r2, r12

   192     VADD.I32    Q10,Q0, Q9

   193     VLD1.64     {D8, D9, D10,D11},[r14,:128]   @ Load Cb

   194     PLD         [r14,#64]

   195     VADD.I32    Q11,Q1, Q9

   196     ADD         r14,r3, r12

   197     VADD.I32    Q12,Q2, Q9

   198     VLD1.64     {D12,D13,D14,D15},[r14,:128]   @ Load Cr

   199     PLD         [r14,#64]

   200     VADD.I32    Q13,Q3, Q9

   201     VRSHRN.S32  D20,Q10,#9     @ Q10 = <xEwExCwCxAwAx8w8x6w6x4w4x2w2x0w0>

   202     VRSHRN.S32  D21,Q11,#9

   203     VDUP.I8     Q9, r12

   204     VRSHRN.S32  D22,Q12,#9     @ Q11 = <xFwFxDwDxBwBx9w9x7w7x5w5x3w3x1w1>

   205     VRSHRN.S32  D23,Q13,#9

   206     @ We don't actually need the x weights, but we get them for free.

   207     @ Free ALU slot

   208     VTRN.8      Q10,Q11        @ Q10 = <wFwEwDwCwBwAw9w8w7w6w5w4w3w2w1w0>

   209     @ Free ALU slot            @ Q11 = <xFxExDxCxBxAx9x8x7x6x5x4x3x2x1x0>

   210     VSUB.S8     Q11,Q11,Q9     @ Make offsets relative to the data we loaded.

   211     VTBL.8      D18,{D8, D9, D10,D11},D22      @ Index Cb at source_x

   212     VMOV.I8     D24,#74

   213     VTBL.8      D19,{D8, D9, D10,D11},D23

   214     VMOV.I8     D26,#102

   215     VTBL.8      D20,{D12,D13,D14,D15},D22      @ Index Cr at source_x

   216     VMOV.I8     D27,#25

   217     VTBL.8      D21,{D12,D13,D14,D15},D23

   218     @ We now have Y' in Q8, Cb in Q9, and Cr in Q10

   219     @ We use VDUP to expand constants, because it's a permute instruction, so

   220     @  it can dual issue on the A8.

   221     SUBS        r6, r6, #16    @ width -= 16 (the flags are consumed by BLT/BEQ below)

   222     VMULL.U8    Q4, D16,D24    @  Q5:Q4  = Y'*74

   223     VDUP.32     Q6, D30[1]     @  Q7:Q6  = bias_G

   224     VMULL.U8    Q5, D17,D24

   225     VDUP.32     Q7, D30[1]

   226     VMLSL.U8    Q6, D18,D27    @  Q7:Q6  = -25*Cb+bias_G

   227     VDUP.32     Q11,D30[0]     @ Q12:Q11 = bias_R

   228     VMLSL.U8    Q7, D19,D27

   229     VDUP.32     Q12,D30[0]

   230     VMLAL.U8    Q11,D20,D26    @ Q12:Q11 = 102*Cr+bias_R

   231     VDUP.32     Q8, D31[0]     @ Q13:Q8  = bias_B

   232     VMLAL.U8    Q12,D21,D26

   233     VDUP.32     Q13,D31[0]

   234     VMLAL.U8    Q8, D18,D29    @ Q13:Q8  = 129*Cb+bias_B

   235     VMLAL.U8    Q13,D19,D29

   236     VMLSL.U8    Q6, D20,D28    @  Q7:Q6  = -25*Cb-52*Cr+bias_G

   237     VMLSL.U8    Q7, D21,D28

   238     VADD.S16    Q11,Q4, Q11    @ Q12:Q11 = 74*Y'+102*Cr+bias_R

   239     VADD.S16    Q12,Q5, Q12

   240     VQADD.S16   Q8, Q4, Q8     @ Q13:Q8  = 74*Y'+129*Cb+bias_B

   241     VQADD.S16   Q13,Q5, Q13

   242     VADD.S16    Q6, Q4, Q6     @  Q7:Q6  = 74*Y'-25*Cb-52*Cr+bias_G

   243     VADD.S16    Q7, Q5, Q7

   244     @ Push each value to the top of its word and saturate it.

   245     VQSHLU.S16 Q11,Q11,#2

   246     VQSHLU.S16 Q12,Q12,#2

   247     VQSHLU.S16 Q6, Q6, #2

   248     VQSHLU.S16 Q7, Q7, #2

   249     VQSHLU.S16 Q8, Q8, #2

   250     VQSHLU.S16 Q13,Q13,#2

   251     @ Merge G and B into R.

   252     VSRI.U16   Q11,Q6, #5

   253     VSRI.U16   Q12,Q7, #5

   254     VSRI.U16   Q11,Q8, #11

   255     MOV         r14,r8, LSL #4         @ r14 = 16*source_dx_q16

   256     VSRI.U16   Q12,Q13,#11

   257     BLT s42xbily_neon_tail             @ Fewer than 16 pixels were left

   258     VDUP.I32    Q13,r14

   259     @ Store the result.

   260     VST1.16     {D22,D23,D24,D25},[r0]!

   261     BEQ s42xbily_neon_done             @ Exactly 16 pixels were left

   262     @ Advance the x coordinates.

   263     VADD.I32    Q0, Q0, Q13

   264     VADD.I32    Q1, Q1, Q13

   265     ADD         r7, r14

   266     VADD.I32    Q2, Q2, Q13

   267     VADD.I32    Q3, Q3, Q13

   268     B s42xbily_neon_loop

   269 s42xbily_neon_tail:

   270     @ We have between 1 and 15 pixels left to write.

   271     @ -r6 == the number of pixels we need to skip writing.

   272     @ Adjust r0 to point to the last one we need to write, because we're going

   273     @  to write them in reverse order.

   274     ADD         r0, r0, r6, LSL #1

   275     MOV         r14,#-2

   276     ADD         r0, r0, #30

   277     @ Skip past the ones we don't need to write.

   278     SUB         PC, PC, r6, LSL #2     @ Jump forward one 4-byte store per skipped pixel

   279     ORR         r0, r0, r0             @ No-op padding: PC reads two instructions ahead of the SUB

   280     VST1.16     {D25[3]},[r0,:16],r14

   281     VST1.16     {D25[2]},[r0,:16],r14

   282     VST1.16     {D25[1]},[r0,:16],r14

   283     VST1.16     {D25[0]},[r0,:16],r14

   284     VST1.16     {D24[3]},[r0,:16],r14

   285     VST1.16     {D24[2]},[r0,:16],r14

   286     VST1.16     {D24[1]},[r0,:16],r14

   287     VST1.16     {D24[0]},[r0,:16],r14

   288     VST1.16     {D23[3]},[r0,:16],r14

   289     VST1.16     {D23[2]},[r0,:16],r14

   290     VST1.16     {D23[1]},[r0,:16],r14

   291     VST1.16     {D23[0]},[r0,:16],r14

   292     VST1.16     {D22[3]},[r0,:16],r14

   293     VST1.16     {D22[2]},[r0,:16],r14

   294     VST1.16     {D22[1]},[r0,:16],r14

   295     VST1.16     {D22[0]},[r0,:16]

   296 s42xbily_neon_done:

   297     VPOP        {Q4-Q7}                @ 16 words.

   298     LDMFD       r13!,{r4-r9,PC}        @ 7 words.

   299     .fnend

   300     .size ScaleYCbCr42xToRGB565_BilinearY_Row_NEON, .-ScaleYCbCr42xToRGB565_BilinearY_Row_NEON

   302 #if defined(__ELF__)&&defined(__linux__)

   303     .section .note.GNU-stack,"",%progbits   @ Empty note section; presumably marks the object as not needing an executable stack (GNU toolchain convention, confirm)

   304 #endif

The Tor Browser / file revision

gfx/ycbcr/yuv_row_arm.s@97036ab72558

gfx/ycbcr/yuv_row_arm.s