DXR is a code search and navigation tool aimed at making sense of large projects. It supports full-text and regex searches as well as structural queries.

Mercurial (5b81998bb7ab)

VCS Links

Line Code
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

    .arch   armv7-a
    .fpu    neon
/* Allow to build on targets not supporting neon, and force the object file
 * target to avoid bumping the final binary target */
    .object_arch armv4t
    .text
    .align

    .balign 64
YCbCr42xToRGB565_DITHER03_CONSTS_NEON:
    .short -14240
    .short -14240+384
    .short   8672
    .short   8672+192
    .short -17696
    .short -17696+384
    .byte 102
    .byte  25
    .byte  52
    .byte 129
YCbCr42xToRGB565_DITHER12_CONSTS_NEON:
    .short -14240+128
    .short -14240+256
    .short   8672+64
    .short   8672+128
    .short -17696+128
    .short -17696+256
    .byte 102
    .byte  25
    .byte  52
    .byte 129
YCbCr42xToRGB565_DITHER21_CONSTS_NEON:
    .short -14240+256
    .short -14240+128
    .short   8672+128
    .short   8672+64
    .short -17696+256
    .short -17696+128
    .byte 102
    .byte  25
    .byte  52
    .byte 129
YCbCr42xToRGB565_DITHER30_CONSTS_NEON:
    .short -14240+384
    .short -14240
    .short   8672+192
    .short   8672
    .short -17696+384
    .short -17696
    .byte 102
    .byte  25
    .byte  52
    .byte 129

@ void ScaleYCbCr42xToRGB565_BilinearY_Row_NEON(
@  yuv2rgb565_row_scale_bilinear_ctx *ctx, int dither);
@
@ ctx = {
@   uint16_t *rgb_row;       /*r0*/
@   const uint8_t *y_row;    /*r1*/
@   const uint8_t *u_row;    /*r2*/
@   const uint8_t *v_row;    /*r3*/
@   int y_yweight;           /*r4*/
@   int y_pitch;             /*r5*/
@   int width;               /*r6*/
@   int source_x0_q16;       /*r7*/
@   int source_dx_q16;       /*r8*/
@   int source_uv_xoffs_q16; /*r9*/
@ };
    .global ScaleYCbCr42xToRGB565_BilinearY_Row_NEON
    .type   ScaleYCbCr42xToRGB565_BilinearY_Row_NEON, %function
    .balign 64
    .fnstart
ScaleYCbCr42xToRGB565_BilinearY_Row_NEON:
    STMFD       r13!,{r4-r9,r14}       @ 8 words.
    ADR         r14,YCbCr42xToRGB565_DITHER03_CONSTS_NEON
    VPUSH       {Q4-Q7}                @ 16 words.
    ADD         r14,r14,r1, LSL #4     @ Select the dither table to use
    LDMIA       r0, {r0-r9}
    @ Set up image index registers.
    ADD         r12,r8, r8
    VMOV.I32    D16,#0         @ Q8 = < 2| 2| 0| 0>*source_dx_q16
    VDUP.32     D17,r12
    ADD         r12,r12,r12
    VTRN.32     D16,D17        @ Q2 = < 2| 0| 2| 0>*source_dx_q16
    VDUP.32     D19,r12        @ Q9 = < 4| 4| ?| ?>*source_dx_q16
    ADD         r12,r12,r12
    VDUP.32     Q0, r7         @ Q0 = < 1| 1| 1| 1>*source_x0_q16
    VADD.I32    D17,D17,D19    @ Q8 = < 6| 4| 2| 0>*source_dx_q16
    CMP         r8, #0                 @ If source_dx_q16 is negative...
    VDUP.32     Q9, r12        @ Q9 = < 8| 8| 8| 8>*source_dx_q16
    ADDLT       r7, r7, r8, LSL #4     @ Make r7 point to the end of the block
    VADD.I32    Q0, Q0, Q8     @ Q0 = < 6| 4| 2| 0>*source_dx_q16+source_x0_q16
    SUBLT       r7, r7, r8             @ (i.e., the lowest address we'll use)
    VADD.I32    Q1, Q0, Q9     @ Q1 = <14|12|10| 8>*source_dx_q16+source_x0_q16
    VDUP.I32    Q9, r8         @ Q8 = < 1| 1| 1| 1>*source_dx_q16
    VADD.I32    Q2, Q0, Q9     @ Q2 = < 7| 5| 3| 1>*source_dx_q16+source_x0_q16
    VADD.I32    Q3, Q1, Q9     @ Q3 = <15|13|11| 9>*source_dx_q16+source_x0_q16
    VLD1.64     {D30,D31},[r14,:128]   @ Load some constants
    VMOV.I8     D28,#52
    VMOV.I8     D29,#129
    @ The basic idea here is to do aligned loads of a block of data and then
    @  index into it using VTBL to extract the data from the source X
    @  coordinate corresponding to each destination pixel.
    @ This is significantly less code and significantly fewer cycles than doing
    @  a series of single-lane loads, but it means that the X step between
    @  pixels must be limited to 2.0 or less, otherwise we couldn't guarantee
    @  that we could read 8 pixels from a single aligned 32-byte block of data.
    @ Q0...Q3 contain the 16.16 fixed-point X coordinates of each pixel,
    @  separated into even pixels and odd pixels to make extracting offsets and
    @  weights easier.
    @ We then pull out two bytes from the middle of each coordinate: the top
    @  byte corresponds to the integer part of the X coordinate, and the bottom
    @  byte corresponds to the weight to use for bilinear blending.
    @ These are separated out into different registers with VTRN.
    @ Then by subtracting the integer X coordinate of the first pixel in the
    @  data block we loaded, we produce an index register suitable for use by
    @  VTBL.
s42xbily_neon_loop:
    @ Load the Y' data.
    MOV         r12,r7, ASR #16
    VRSHRN.S32  D16,Q0, #8
    AND         r12,r12,#~15   @ Read 16-byte aligned blocks
    VDUP.I8     D20,r12
    ADD         r12,r1, r12    @ r12 = y_row+(source_x&~7)
    VRSHRN.S32  D17,Q1, #8
    PLD         [r12,#64]
    VLD1.64     {D8, D9, D10,D11},[r12,:128],r5        @ Load Y' top row
    ADD         r14,r7, r8, LSL #3
    VRSHRN.S32  D18,Q2, #8
    MOV         r14,r14,ASR #16
    VRSHRN.S32  D19,Q3, #8
    AND         r14,r14,#~15   @ Read 16-byte aligned blocks
    VLD1.64     {D12,D13,D14,D15},[r12,:128]           @ Load Y' bottom row
    PLD         [r12,#64]
    VDUP.I8     D21,r14
    ADD         r14,r1, r14    @ r14 = y_row+(source_x&~7)
    VMOV.I8     Q13,#1
    PLD         [r14,#64]
    VTRN.8      Q8, Q9         @ Q8  = <wFwEwDwCwBwAw9w8w7w6w5w4w3w2w1w0>
                               @ Q9  = <xFxExDxCxBxAx9x8x7x6x5x4x3x2x1x0>
    VSUB.S8     Q9, Q9, Q10    @ Make offsets relative to the data we loaded.
    @ First 8 Y' pixels
    VTBL.8      D20,{D8, D9, D10,D11},D18      @ Index top row at source_x
    VTBL.8      D24,{D12,D13,D14,D15},D18      @ Index bottom row at source_x
    VADD.S8     Q13,Q9, Q13                    @ Add 1 to source_x
    VTBL.8      D22,{D8, D9, D10,D11},D26      @ Index top row at source_x+1
    VTBL.8      D26,{D12,D13,D14,D15},D26      @ Index bottom row at source_x+1
    @ Next 8 Y' pixels
    VLD1.64     {D8, D9, D10,D11},[r14,:128],r5        @ Load Y' top row
    VLD1.64     {D12,D13,D14,D15},[r14,:128]           @ Load Y' bottom row
    PLD         [r14,#64]
    VTBL.8      D21,{D8, D9, D10,D11},D19      @ Index top row at source_x
    VTBL.8      D25,{D12,D13,D14,D15},D19      @ Index bottom row at source_x
    VTBL.8      D23,{D8, D9, D10,D11},D27      @ Index top row at source_x+1
    VTBL.8      D27,{D12,D13,D14,D15},D27      @ Index bottom row at source_x+1
    @ Blend Y'.
    VDUP.I16    Q9, r4         @ Load the y weights.
    VSUBL.U8    Q4, D24,D20    @ Q5:Q4 = c-a
    VSUBL.U8    Q5, D25,D21
    VSUBL.U8    Q6, D26,D22    @ Q7:Q6 = d-b
    VSUBL.U8    Q7, D27,D23
    VMUL.S16    Q4, Q4, Q9     @ Q5:Q4 = (c-a)*yweight
    VMUL.S16    Q5, Q5, Q9
    VMUL.S16    Q6, Q6, Q9     @ Q7:Q6 = (d-b)*yweight
    VMUL.S16    Q7, Q7, Q9
    VMOVL.U8    Q12,D16        @ Promote the x weights to 16 bits.
    VMOVL.U8    Q13,D17        @ Sadly, there's no VMULW.
    VRSHRN.S16  D8, Q4, #8     @ Q4 = (c-a)*yweight+128>>8
    VRSHRN.S16  D9, Q5, #8
    VRSHRN.S16  D12,Q6, #8     @ Q6 = (d-b)*yweight+128>>8
    VRSHRN.S16  D13,Q7, #8
    VADD.I8     Q10,Q10,Q4     @ Q10 = a+((c-a)*yweight+128>>8)
    VADD.I8     Q11,Q11,Q6     @ Q11 = b+((d-b)*yweight+128>>8)
    VSUBL.U8    Q4, D22,D20    @ Q5:Q4 = b-a
    VSUBL.U8    Q5, D23,D21
    VMUL.S16    Q4, Q4, Q12    @ Q5:Q4 = (b-a)*xweight
    VMUL.S16    Q5, Q5, Q13
    VRSHRN.S16  D8, Q4, #8     @ Q4 = (b-a)*xweight+128>>8
    ADD         r12,r7, r9
    VRSHRN.S16  D9, Q5, #8
    MOV         r12,r12,ASR #17
    VADD.I8     Q8, Q10,Q4     @ Q8 = a+((b-a)*xweight+128>>8)
    @ Start extracting the chroma x coordinates, and load Cb and Cr.
    AND         r12,r12,#~15   @ Read 16-byte aligned blocks
    VDUP.I32    Q9, r9         @ Q9 = source_uv_xoffs_q16 x 4
    ADD         r14,r2, r12
    VADD.I32    Q10,Q0, Q9
    VLD1.64     {D8, D9, D10,D11},[r14,:128]   @ Load Cb
    PLD         [r14,#64]
    VADD.I32    Q11,Q1, Q9
    ADD         r14,r3, r12
    VADD.I32    Q12,Q2, Q9
    VLD1.64     {D12,D13,D14,D15},[r14,:128]   @ Load Cr
    PLD         [r14,#64]
    VADD.I32    Q13,Q3, Q9
    VRSHRN.S32  D20,Q10,#9     @ Q10 = <xEwExCwCxAwAx8w8x6w6x4w4x2w2x0w0>
    VRSHRN.S32  D21,Q11,#9
    VDUP.I8     Q9, r12
    VRSHRN.S32  D22,Q12,#9     @ Q11 = <xFwFxDwDxBwBx9w9x7w7x5w5x3w3x1w1>
    VRSHRN.S32  D23,Q13,#9
    @ We don't actually need the x weights, but we get them for free.
    @ Free ALU slot
    VTRN.8      Q10,Q11        @ Q10 = <wFwEwDwCwBwAw9w8w7w6w5w4w3w2w1w0>
    @ Free ALU slot            @ Q11 = <xFxExDxCxBxAx9x8x7x6x5x4x3x2x1x0>
    VSUB.S8     Q11,Q11,Q9     @ Make offsets relative to the data we loaded.
    VTBL.8      D18,{D8, D9, D10,D11},D22      @ Index Cb at source_x
    VMOV.I8     D24,#74
    VTBL.8      D19,{D8, D9, D10,D11},D23
    VMOV.I8     D26,#102
    VTBL.8      D20,{D12,D13,D14,D15},D22      @ Index Cr at source_x
    VMOV.I8     D27,#25
    VTBL.8      D21,{D12,D13,D14,D15},D23
    @ We now have Y' in Q8, Cb in Q9, and Cr in Q10
    @ We use VDUP to expand constants, because it's a permute instruction, so
    @  it can dual issue on the A8.
    SUBS        r6, r6, #16    @ width -= 16
    VMULL.U8    Q4, D16,D24    @  Q5:Q4  = Y'*74
    VDUP.32     Q6, D30[1]     @  Q7:Q6  = bias_G
    VMULL.U8    Q5, D17,D24
    VDUP.32     Q7, D30[1]
    VMLSL.U8    Q6, D18,D27    @  Q7:Q6  = -25*Cb+bias_G
    VDUP.32     Q11,D30[0]     @ Q12:Q11 = bias_R
    VMLSL.U8    Q7, D19,D27
    VDUP.32     Q12,D30[0]
    VMLAL.U8    Q11,D20,D26    @ Q12:Q11 = 102*Cr+bias_R
    VDUP.32     Q8, D31[0]     @ Q13:Q8  = bias_B
    VMLAL.U8    Q12,D21,D26
    VDUP.32     Q13,D31[0]
    VMLAL.U8    Q8, D18,D29    @ Q13:Q8  = 129*Cb+bias_B
    VMLAL.U8    Q13,D19,D29
    VMLSL.U8    Q6, D20,D28    @  Q7:Q6  = -25*Cb-52*Cr+bias_G
    VMLSL.U8    Q7, D21,D28
    VADD.S16    Q11,Q4, Q11    @ Q12:Q11 = 74*Y'+102*Cr+bias_R
    VADD.S16    Q12,Q5, Q12
    VQADD.S16   Q8, Q4, Q8     @ Q13:Q8  = 74*Y'+129*Cr+bias_B
    VQADD.S16   Q13,Q5, Q13
    VADD.S16    Q6, Q4, Q6     @  Q7:Q6  = 74*Y'-25*Cb-52*Cr+bias_G
    VADD.S16    Q7, Q5, Q7
    @ Push each value to the top of its word and saturate it.
    VQSHLU.S16 Q11,Q11,#2
    VQSHLU.S16 Q12,Q12,#2
    VQSHLU.S16 Q6, Q6, #2
    VQSHLU.S16 Q7, Q7, #2
    VQSHLU.S16 Q8, Q8, #2
    VQSHLU.S16 Q13,Q13,#2
    @ Merge G and B into R.
    VSRI.U16   Q11,Q6, #5
    VSRI.U16   Q12,Q7, #5
    VSRI.U16   Q11,Q8, #11
    MOV         r14,r8, LSL #4
    VSRI.U16   Q12,Q13,#11
    BLT s42xbily_neon_tail
    VDUP.I32    Q13,r14
    @ Store the result.
    VST1.16     {D22,D23,D24,D25},[r0]!
    BEQ s42xbily_neon_done
    @ Advance the x coordinates.
    VADD.I32    Q0, Q0, Q13
    VADD.I32    Q1, Q1, Q13
    ADD         r7, r14
    VADD.I32    Q2, Q2, Q13
    VADD.I32    Q3, Q3, Q13
    B s42xbily_neon_loop
s42xbily_neon_tail:
    @ We have between 1 and 15 pixels left to write.
    @ -r6 == the number of pixels we need to skip writing.
    @ Adjust r0 to point to the last one we need to write, because we're going
    @  to write them in reverse order.
    ADD         r0, r0, r6, LSL #1
    MOV         r14,#-2
    ADD         r0, r0, #30
    @ Skip past the ones we don't need to write.
    SUB         PC, PC, r6, LSL #2
    ORR         r0, r0, r0
    VST1.16     {D25[3]},[r0,:16],r14
    VST1.16     {D25[2]},[r0,:16],r14
    VST1.16     {D25[1]},[r0,:16],r14
    VST1.16     {D25[0]},[r0,:16],r14
    VST1.16     {D24[3]},[r0,:16],r14
    VST1.16     {D24[2]},[r0,:16],r14
    VST1.16     {D24[1]},[r0,:16],r14
    VST1.16     {D24[0]},[r0,:16],r14
    VST1.16     {D23[3]},[r0,:16],r14
    VST1.16     {D23[2]},[r0,:16],r14
    VST1.16     {D23[1]},[r0,:16],r14
    VST1.16     {D23[0]},[r0,:16],r14
    VST1.16     {D22[3]},[r0,:16],r14
    VST1.16     {D22[2]},[r0,:16],r14
    VST1.16     {D22[1]},[r0,:16],r14
    VST1.16     {D22[0]},[r0,:16]
s42xbily_neon_done:
    VPOP        {Q4-Q7}                @ 16 words.
    LDMFD       r13!,{r4-r9,PC}        @ 8 words.
    .fnend
    .size ScaleYCbCr42xToRGB565_BilinearY_Row_NEON, .-ScaleYCbCr42xToRGB565_BilinearY_Row_NEON

#if defined(__ELF__)&&defined(__linux__)
    .section .note.GNU-stack,"",%progbits
#endif