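pixman-arm32-clang.patch

Adjust the 32-bit ARM NEON assembly in libpixman so that it can be assembled
with clang. Changes made by this patch:

 * Macro bodies now reference their arguments explicitly as \arg (for example
   "\reg1", "\numpix") instead of by bare name.
 * Macro names built from an argument use \() as the separator (for example
   "bilinear_load_mask_\()\mask_fmt") instead of the "&" operator.
 * Invocations of the PF macro put a comma after the mnemonic
   ("PF add, PF_X, PF_X, #8" instead of "PF add PF_X, PF_X, #8").
 * When __clang__ is defined, the conditional mnemonics ldrgeb, subges and
   subpls are #defined to ldrbge, subsge and subspl respectively.
 * The bilinear scanline template ends with pixman_end_asm_function instead
   of .endfunc.

Upstream issue: https://gitlab.freedesktop.org/pixman/pixman/-/issues/74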

diff --git a/gfx/cairo/libpixman/src/pixman-arm-neon-asm-bilinear.S b/gfx/cairo/libpixman/src/pixman-arm-neon-asm-bilinear.S

--- a/gfx/cairo/libpixman/src/pixman-arm-neon-asm-bilinear.S

+++ b/gfx/cairo/libpixman/src/pixman-arm-neon-asm-bilinear.S

@@ -77,206 +77,206 @@

  * format conversion, and interpolation as separate macros which can be used

  * as the basic building blocks for constructing bilinear scanline functions.

*/

 .macro bilinear_load_8888 reg1, reg2, tmp

     mov       TMP1, X, asr #16

     add       X, X, UX

     add       TMP1, TOP, TMP1, asl #2

-    vld1.32   {reg1}, [TMP1], STRIDE

-    vld1.32   {reg2}, [TMP1]

+    vld1.32   {\reg1}, [TMP1], STRIDE

+    vld1.32   {\reg2}, [TMP1]

 .endm

 .macro bilinear_load_0565 reg1, reg2, tmp

     mov       TMP1, X, asr #16

     add       X, X, UX

     add       TMP1, TOP, TMP1, asl #1

-    vld1.32   {reg2[0]}, [TMP1], STRIDE

-    vld1.32   {reg2[1]}, [TMP1]

-    convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp

+    vld1.32   {\reg2[0]}, [TMP1], STRIDE

+    vld1.32   {\reg2[1]}, [TMP1]

+    convert_four_0565_to_x888_packed \reg2, \reg1, \reg2, \tmp

 .endm

 .macro bilinear_load_and_vertical_interpolate_two_8888 \

                     acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2

-    bilinear_load_8888 reg1, reg2, tmp1

-    vmull.u8  acc1, reg1, d28

-    vmlal.u8  acc1, reg2, d29

-    bilinear_load_8888 reg3, reg4, tmp2

-    vmull.u8  acc2, reg3, d28

-    vmlal.u8  acc2, reg4, d29

+    bilinear_load_8888 \reg1, \reg2, \tmp1

+    vmull.u8  \acc1, \reg1, d28

+    vmlal.u8  \acc1, \reg2, d29

+    bilinear_load_8888 \reg3, \reg4, \tmp2

+    vmull.u8  \acc2, \reg3, d28

+    vmlal.u8  \acc2, \reg4, d29

 .endm

 .macro bilinear_load_and_vertical_interpolate_four_8888 \

                 xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \

                 yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi

     bilinear_load_and_vertical_interpolate_two_8888 \

-                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi

+                \xacc1, \xacc2, \xreg1, \xreg2, \xreg3, \xreg4, \xacc2lo, \xacc2hi

     bilinear_load_and_vertical_interpolate_two_8888 \

-                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi

+                \yacc1, \yacc2, \yreg1, \yreg2, \yreg3, \yreg4, \yacc2lo, \yacc2hi

 .endm

 .macro bilinear_load_and_vertical_interpolate_two_0565 \

                 acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi

     mov       TMP1, X, asr #16

     add       X, X, UX

     add       TMP1, TOP, TMP1, asl #1

     mov       TMP2, X, asr #16

     add       X, X, UX

     add       TMP2, TOP, TMP2, asl #1

-    vld1.32   {acc2lo[0]}, [TMP1], STRIDE

-    vld1.32   {acc2hi[0]}, [TMP2], STRIDE

-    vld1.32   {acc2lo[1]}, [TMP1]

-    vld1.32   {acc2hi[1]}, [TMP2]

-    convert_0565_to_x888 acc2, reg3, reg2, reg1

-    vzip.u8   reg1, reg3

-    vzip.u8   reg2, reg4

-    vzip.u8   reg3, reg4

-    vzip.u8   reg1, reg2

-    vmull.u8  acc1, reg1, d28

-    vmlal.u8  acc1, reg2, d29

-    vmull.u8  acc2, reg3, d28

-    vmlal.u8  acc2, reg4, d29

+    vld1.32   {\acc2lo[0]}, [TMP1], STRIDE

+    vld1.32   {\acc2hi[0]}, [TMP2], STRIDE

+    vld1.32   {\acc2lo[1]}, [TMP1]

+    vld1.32   {\acc2hi[1]}, [TMP2]

+    convert_0565_to_x888 \acc2, \reg3, \reg2, \reg1

+    vzip.u8   \reg1, \reg3

+    vzip.u8   \reg2, \reg4

+    vzip.u8   \reg3, \reg4

+    vzip.u8   \reg1, \reg2

+    vmull.u8  \acc1, \reg1, d28

+    vmlal.u8  \acc1, \reg2, d29

+    vmull.u8  \acc2, \reg3, d28

+    vmlal.u8  \acc2, \reg4, d29

 .endm

 .macro bilinear_load_and_vertical_interpolate_four_0565 \

                 xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \

                 yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi

     mov       TMP1, X, asr #16

     add       X, X, UX

     add       TMP1, TOP, TMP1, asl #1

     mov       TMP2, X, asr #16

     add       X, X, UX

     add       TMP2, TOP, TMP2, asl #1

-    vld1.32   {xacc2lo[0]}, [TMP1], STRIDE

-    vld1.32   {xacc2hi[0]}, [TMP2], STRIDE

-    vld1.32   {xacc2lo[1]}, [TMP1]

-    vld1.32   {xacc2hi[1]}, [TMP2]

-    convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1

+    vld1.32   {\xacc2lo[0]}, [TMP1], STRIDE

+    vld1.32   {\xacc2hi[0]}, [TMP2], STRIDE

+    vld1.32   {\xacc2lo[1]}, [TMP1]

+    vld1.32   {\xacc2hi[1]}, [TMP2]

+    convert_0565_to_x888 \xacc2, \xreg3, \xreg2, \xreg1

     mov       TMP1, X, asr #16

     add       X, X, UX

     add       TMP1, TOP, TMP1, asl #1

     mov       TMP2, X, asr #16

     add       X, X, UX

     add       TMP2, TOP, TMP2, asl #1

-    vld1.32   {yacc2lo[0]}, [TMP1], STRIDE

-    vzip.u8   xreg1, xreg3

-    vld1.32   {yacc2hi[0]}, [TMP2], STRIDE

-    vzip.u8   xreg2, xreg4

-    vld1.32   {yacc2lo[1]}, [TMP1]

-    vzip.u8   xreg3, xreg4

-    vld1.32   {yacc2hi[1]}, [TMP2]

-    vzip.u8   xreg1, xreg2

-    convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1

-    vmull.u8  xacc1, xreg1, d28

-    vzip.u8   yreg1, yreg3

-    vmlal.u8  xacc1, xreg2, d29

-    vzip.u8   yreg2, yreg4

-    vmull.u8  xacc2, xreg3, d28

-    vzip.u8   yreg3, yreg4

-    vmlal.u8  xacc2, xreg4, d29

-    vzip.u8   yreg1, yreg2

-    vmull.u8  yacc1, yreg1, d28

-    vmlal.u8  yacc1, yreg2, d29

-    vmull.u8  yacc2, yreg3, d28

-    vmlal.u8  yacc2, yreg4, d29

+    vld1.32   {\yacc2lo[0]}, [TMP1], STRIDE

+    vzip.u8   \xreg1, \xreg3

+    vld1.32   {\yacc2hi[0]}, [TMP2], STRIDE

+    vzip.u8   \xreg2, \xreg4

+    vld1.32   {\yacc2lo[1]}, [TMP1]

+    vzip.u8   \xreg3, \xreg4

+    vld1.32   {\yacc2hi[1]}, [TMP2]

+    vzip.u8   \xreg1, \xreg2

+    convert_0565_to_x888 \yacc2, \yreg3, \yreg2, \yreg1

+    vmull.u8  \xacc1, \xreg1, d28

+    vzip.u8   \yreg1, \yreg3

+    vmlal.u8  \xacc1, \xreg2, d29

+    vzip.u8   \yreg2, \yreg4

+    vmull.u8  \xacc2, \xreg3, d28

+    vzip.u8   \yreg3, \yreg4

+    vmlal.u8  \xacc2, \xreg4, d29

+    vzip.u8   \yreg1, \yreg2

+    vmull.u8  \yacc1, \yreg1, d28

+    vmlal.u8  \yacc1, \yreg2, d29

+    vmull.u8  \yacc2, \yreg3, d28

+    vmlal.u8  \yacc2, \yreg4, d29

 .endm

 .macro bilinear_store_8888 numpix, tmp1, tmp2

-.if numpix == 4

+.if \numpix == 4

     vst1.32   {d0, d1}, [OUT]!

-.elseif numpix == 2

+.elseif \numpix == 2

     vst1.32   {d0}, [OUT]!

-.elseif numpix == 1

+.elseif \numpix == 1

     vst1.32   {d0[0]}, [OUT, :32]!

 .else

     .error bilinear_store_8888 numpix is unsupported

 .endif

 .endm

 .macro bilinear_store_0565 numpix, tmp1, tmp2

     vuzp.u8 d0, d1

     vuzp.u8 d2, d3

     vuzp.u8 d1, d3

     vuzp.u8 d0, d2

-    convert_8888_to_0565 d2, d1, d0, q1, tmp1, tmp2

-.if numpix == 4

+    convert_8888_to_0565 d2, d1, d0, q1, \tmp1, \tmp2

+.if \numpix == 4

     vst1.16   {d2}, [OUT]!

-.elseif numpix == 2

+.elseif \numpix == 2

     vst1.32   {d2[0]}, [OUT]!

-.elseif numpix == 1

+.elseif \numpix == 1

     vst1.16   {d2[0]}, [OUT]!

 .else

     .error bilinear_store_0565 numpix is unsupported

 .endif

 .endm

/*

  * Macros for loading mask pixels into register 'mask'.

  * vdup must be done in somewhere else.

*/

 .macro bilinear_load_mask_x numpix, mask

 .endm

 .macro bilinear_load_mask_8 numpix, mask

-.if numpix == 4

-    vld1.32     {mask[0]}, [MASK]!

-.elseif numpix == 2

-    vld1.16     {mask[0]}, [MASK]!

-.elseif numpix == 1

-    vld1.8      {mask[0]}, [MASK]!

+.if \numpix == 4

+    vld1.32     {\mask[0]}, [MASK]!

+.elseif \numpix == 2

+    vld1.16     {\mask[0]}, [MASK]!

+.elseif \numpix == 1

+    vld1.8      {\mask[0]}, [MASK]!

 .else

-    .error bilinear_load_mask_8 numpix is unsupported

+    .error bilinear_load_mask_8 \numpix is unsupported

 .endif

     pld         [MASK, #prefetch_offset]

 .endm

 .macro bilinear_load_mask mask_fmt, numpix, mask

-    bilinear_load_mask_&mask_fmt numpix, mask

+    bilinear_load_mask_\()\mask_fmt \numpix, \mask

 .endm

/*

  * Macros for loading destination pixels into register 'dst0' and 'dst1'.

  * Interleave should be done somewhere else.

*/

 .macro bilinear_load_dst_0565_src numpix, dst0, dst1, dst01

 .endm

 .macro bilinear_load_dst_8888_src numpix, dst0, dst1, dst01

 .endm

 .macro bilinear_load_dst_8888 numpix, dst0, dst1, dst01

-.if numpix == 4

-    vld1.32     {dst0, dst1}, [OUT]

-.elseif numpix == 2

-    vld1.32     {dst0}, [OUT]

-.elseif numpix == 1

-    vld1.32     {dst0[0]}, [OUT]

+.if \numpix == 4

+    vld1.32     {\dst0, \dst1}, [OUT]

+.elseif \numpix == 2

+    vld1.32     {\dst0}, [OUT]

+.elseif \numpix == 1

+    vld1.32     {\dst0[0]}, [OUT]

 .else

-    .error bilinear_load_dst_8888 numpix is unsupported

+    .error bilinear_load_dst_8888 \numpix is unsupported

 .endif

     pld         [OUT, #(prefetch_offset * 4)]

 .endm

 .macro bilinear_load_dst_8888_over numpix, dst0, dst1, dst01

-    bilinear_load_dst_8888 numpix, dst0, dst1, dst01

+    bilinear_load_dst_8888 \numpix, \dst0, \dst1, \dst01

 .endm

 .macro bilinear_load_dst_8888_add numpix, dst0, dst1, dst01

-    bilinear_load_dst_8888 numpix, dst0, dst1, dst01

+    bilinear_load_dst_8888 \numpix, \dst0, \dst1, \dst01

 .endm

 .macro bilinear_load_dst dst_fmt, op, numpix, dst0, dst1, dst01

-    bilinear_load_dst_&dst_fmt&_&op numpix, dst0, dst1, dst01

+    bilinear_load_dst_\()\dst_fmt\()_\()\op \numpix, \dst0, \dst1, \dst01

 .endm

/*

  * Macros for duplicating partially loaded mask to fill entire register.

  * We will apply mask to interleaved source pixels, that is

  *  (r0, r1, r2, r3, g0, g1, g2, g3) x (m0, m1, m2, m3, m0, m1, m2, m3)

  *  (b0, b1, b2, b3, a0, a1, a2, a3) x (m0, m1, m2, m3, m0, m1, m2, m3)

  * So, we need to duplicate loaded mask into whole register.

@@ -285,79 +285,79 @@

  *  (r0, r1, x, x, g0, g1, x, x) x (m0, m1, m0, m1, m0, m1, m0, m1)

  *  (b0, b1, x, x, a0, a1, x, x) x (m0, m1, m0, m1, m0, m1, m0, m1)

  * We can do some optimizations for this including last pixel cases.

*/

 .macro bilinear_duplicate_mask_x numpix, mask

 .endm

 .macro bilinear_duplicate_mask_8 numpix, mask

-.if numpix == 4

-    vdup.32     mask, mask[0]

-.elseif numpix == 2

-    vdup.16     mask, mask[0]

-.elseif numpix == 1

-    vdup.8      mask, mask[0]

+.if \numpix == 4

+    vdup.32     \mask, \mask[0]

+.elseif \numpix == 2

+    vdup.16     \mask, \mask[0]

+.elseif \numpix == 1

+    vdup.8      \mask, \mask[0]

 .else

     .error bilinear_duplicate_mask_8 is unsupported

 .endif

 .endm

 .macro bilinear_duplicate_mask mask_fmt, numpix, mask

-    bilinear_duplicate_mask_&mask_fmt numpix, mask

+    bilinear_duplicate_mask_\()\mask_fmt \numpix, \mask

 .endm

/*

  * Macros for interleaving src and dst pixels to rrrr gggg bbbb aaaa form.

  * Interleave should be done when maks is enabled or operator is 'over'.

*/

 .macro bilinear_interleave src0, src1, dst0, dst1

-    vuzp.8      src0, src1

-    vuzp.8      dst0, dst1

-    vuzp.8      src0, src1

-    vuzp.8      dst0, dst1

+    vuzp.8      \src0, \src1

+    vuzp.8      \dst0, \dst1

+    vuzp.8      \src0, \src1

+    vuzp.8      \dst0, \dst1

 .endm

 .macro bilinear_interleave_src_dst_x_src \

                 numpix, src0, src1, src01, dst0, dst1, dst01

 .endm

 .macro bilinear_interleave_src_dst_x_over \

                 numpix, src0, src1, src01, dst0, dst1, dst01

-    bilinear_interleave src0, src1, dst0, dst1

+    bilinear_interleave \src0, \src1, \dst0, \dst1

 .endm

 .macro bilinear_interleave_src_dst_x_add \

                 numpix, src0, src1, src01, dst0, dst1, dst01

 .endm

 .macro bilinear_interleave_src_dst_8_src \

                 numpix, src0, src1, src01, dst0, dst1, dst01

-    bilinear_interleave src0, src1, dst0, dst1

+    bilinear_interleave \src0, \src1, \dst0, \dst1

 .endm

 .macro bilinear_interleave_src_dst_8_over \

                 numpix, src0, src1, src01, dst0, dst1, dst01

-    bilinear_interleave src0, src1, dst0, dst1

+    bilinear_interleave \src0, \src1, \dst0, \dst1

 .endm

 .macro bilinear_interleave_src_dst_8_add \

                 numpix, src0, src1, src01, dst0, dst1, dst01

-    bilinear_interleave src0, src1, dst0, dst1

+    bilinear_interleave \src0, \src1, \dst0, \dst1

 .endm

 .macro bilinear_interleave_src_dst \

                 mask_fmt, op, numpix, src0, src1, src01, dst0, dst1, dst01

-    bilinear_interleave_src_dst_&mask_fmt&_&op \

-                numpix, src0, src1, src01, dst0, dst1, dst01

+    bilinear_interleave_src_dst_\()\mask_fmt\()_\()\op \

+                \numpix, \src0, \src1, \src01, \dst0, \dst1, \dst01

 .endm

/*

  * Macros for applying masks to src pixels. (see combine_mask_u() function)

  * src, dst should be in interleaved form.

  * mask register should be in form (m0, m1, m2, m3).

*/

@@ -365,217 +365,217 @@

                 numpix, src0, src1, src01, mask, \

                 tmp01, tmp23, tmp45, tmp67

 .endm

 .macro bilinear_apply_mask_to_src_8 \

                 numpix, src0, src1, src01, mask, \

                 tmp01, tmp23, tmp45, tmp67

-    vmull.u8        tmp01, src0, mask

-    vmull.u8        tmp23, src1, mask

+    vmull.u8        \tmp01, \src0, \mask

+    vmull.u8        \tmp23, \src1, \mask

     /* bubbles */

-    vrshr.u16       tmp45, tmp01, #8

-    vrshr.u16       tmp67, tmp23, #8

+    vrshr.u16       \tmp45, \tmp01, #8

+    vrshr.u16       \tmp67, \tmp23, #8

     /* bubbles */

-    vraddhn.u16     src0, tmp45, tmp01

-    vraddhn.u16     src1, tmp67, tmp23

+    vraddhn.u16     \src0, \tmp45, \tmp01

+    vraddhn.u16     \src1, \tmp67, \tmp23

 .endm

 .macro bilinear_apply_mask_to_src \

                 mask_fmt, numpix, src0, src1, src01, mask, \

                 tmp01, tmp23, tmp45, tmp67

-    bilinear_apply_mask_to_src_&mask_fmt \

-                numpix, src0, src1, src01, mask, \

-                tmp01, tmp23, tmp45, tmp67

+    bilinear_apply_mask_to_src_\()\mask_fmt \

+                \numpix, \src0, \src1, \src01, \mask, \

+                \tmp01, \tmp23, \tmp45, \tmp67

 .endm

/*

  * Macros for combining src and destination pixels.

  * Interleave or not is depending on operator 'op'.

*/

 .macro bilinear_combine_src \

                 numpix, src0, src1, src01, dst0, dst1, dst01, \

                 tmp01, tmp23, tmp45, tmp67, tmp8

 .endm

 .macro bilinear_combine_over \

                 numpix, src0, src1, src01, dst0, dst1, dst01, \

                 tmp01, tmp23, tmp45, tmp67, tmp8

-    vdup.32     tmp8, src1[1]

+    vdup.32     \tmp8, \src1[1]

     /* bubbles */

-    vmvn.8      tmp8, tmp8

+    vmvn.8      \tmp8, \tmp8

     /* bubbles */

-    vmull.u8    tmp01, dst0, tmp8

+    vmull.u8    \tmp01, \dst0, \tmp8

     /* bubbles */

-    vmull.u8    tmp23, dst1, tmp8

+    vmull.u8    \tmp23, \dst1, \tmp8

     /* bubbles */

-    vrshr.u16   tmp45, tmp01, #8

-    vrshr.u16   tmp67, tmp23, #8

+    vrshr.u16   \tmp45, \tmp01, #8

+    vrshr.u16   \tmp67, \tmp23, #8

     /* bubbles */

-    vraddhn.u16 dst0, tmp45, tmp01

-    vraddhn.u16 dst1, tmp67, tmp23

+    vraddhn.u16 \dst0, \tmp45, \tmp01

+    vraddhn.u16 \dst1, \tmp67, \tmp23

     /* bubbles */

-    vqadd.u8    src01, dst01, src01

+    vqadd.u8    \src01, \dst01, \src01

 .endm

 .macro bilinear_combine_add \

                 numpix, src0, src1, src01, dst0, dst1, dst01, \

                 tmp01, tmp23, tmp45, tmp67, tmp8

-    vqadd.u8    src01, dst01, src01

+    vqadd.u8    \src01, \dst01, \src01

 .endm

 .macro bilinear_combine \

                 op, numpix, src0, src1, src01, dst0, dst1, dst01, \

                 tmp01, tmp23, tmp45, tmp67, tmp8

-    bilinear_combine_&op \

-                numpix, src0, src1, src01, dst0, dst1, dst01, \

-                tmp01, tmp23, tmp45, tmp67, tmp8

+    bilinear_combine_\()\op \

+                \numpix, \src0, \src1, \src01, \dst0, \dst1, \dst01, \

+                \tmp01, \tmp23, \tmp45, \tmp67, \tmp8

 .endm

/*

  * Macros for final deinterleaving of destination pixels if needed.

*/

 .macro bilinear_deinterleave numpix, dst0, dst1, dst01

-    vuzp.8      dst0, dst1

+    vuzp.8      \dst0, \dst1

     /* bubbles */

-    vuzp.8      dst0, dst1

+    vuzp.8      \dst0, \dst1

 .endm

 .macro bilinear_deinterleave_dst_x_src numpix, dst0, dst1, dst01

 .endm

 .macro bilinear_deinterleave_dst_x_over numpix, dst0, dst1, dst01

-    bilinear_deinterleave numpix, dst0, dst1, dst01

+    bilinear_deinterleave \numpix, \dst0, \dst1, \dst01

 .endm

 .macro bilinear_deinterleave_dst_x_add numpix, dst0, dst1, dst01

 .endm

 .macro bilinear_deinterleave_dst_8_src numpix, dst0, dst1, dst01

-    bilinear_deinterleave numpix, dst0, dst1, dst01

+    bilinear_deinterleave \numpix, \dst0, \dst1, \dst01

 .endm

 .macro bilinear_deinterleave_dst_8_over numpix, dst0, dst1, dst01

-    bilinear_deinterleave numpix, dst0, dst1, dst01

+    bilinear_deinterleave \numpix, \dst0, \dst1, \dst01

 .endm

 .macro bilinear_deinterleave_dst_8_add numpix, dst0, dst1, dst01

-    bilinear_deinterleave numpix, dst0, dst1, dst01

+    bilinear_deinterleave \numpix, \dst0, \dst1, \dst01

 .endm

 .macro bilinear_deinterleave_dst mask_fmt, op, numpix, dst0, dst1, dst01

-    bilinear_deinterleave_dst_&mask_fmt&_&op numpix, dst0, dst1, dst01

+    bilinear_deinterleave_dst_\()\mask_fmt\()_\()\op \numpix, \dst0, \dst1, \dst01

 .endm

 .macro bilinear_interpolate_last_pixel src_fmt, mask_fmt, dst_fmt, op

-    bilinear_load_&src_fmt d0, d1, d2

-    bilinear_load_mask mask_fmt, 1, d4

-    bilinear_load_dst dst_fmt, op, 1, d18, d19, q9

+    bilinear_load_\()\src_fmt d0, d1, d2

+    bilinear_load_mask \mask_fmt, 1, d4

+    bilinear_load_dst \dst_fmt, \op, 1, d18, d19, q9

     vmull.u8  q1, d0, d28

     vmlal.u8  q1, d1, d29

     /* 5 cycles bubble */

     vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS

     vmlsl.u16 q0, d2, d30

     vmlal.u16 q0, d3, d30

     /* 5 cycles bubble */

-    bilinear_duplicate_mask mask_fmt, 1, d4

+    bilinear_duplicate_mask \mask_fmt, 1, d4

     vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)

     /* 3 cycles bubble */

     vmovn.u16 d0, q0

     /* 1 cycle bubble */

     bilinear_interleave_src_dst \

-                mask_fmt, op, 1, d0, d1, q0, d18, d19, q9

+                \mask_fmt, \op, 1, d0, d1, q0, d18, d19, q9

     bilinear_apply_mask_to_src \

-                mask_fmt, 1, d0, d1, q0, d4, \

+                \mask_fmt, 1, d0, d1, q0, d4, \

                 q3, q8, q10, q11

     bilinear_combine \

-                op, 1, d0, d1, q0, d18, d19, q9, \

+                \op, 1, d0, d1, q0, d18, d19, q9, \

                 q3, q8, q10, q11, d5

-    bilinear_deinterleave_dst mask_fmt, op, 1, d0, d1, q0

-    bilinear_store_&dst_fmt 1, q2, q3

+    bilinear_deinterleave_dst \mask_fmt, \op, 1, d0, d1, q0

+    bilinear_store_\()\dst_fmt 1, q2, q3

 .endm

 .macro bilinear_interpolate_two_pixels src_fmt, mask_fmt, dst_fmt, op

-    bilinear_load_and_vertical_interpolate_two_&src_fmt \

+    bilinear_load_and_vertical_interpolate_two_\()\src_fmt \

                 q1, q11, d0, d1, d20, d21, d22, d23

-    bilinear_load_mask mask_fmt, 2, d4

-    bilinear_load_dst dst_fmt, op, 2, d18, d19, q9

+    bilinear_load_mask \mask_fmt, 2, d4

+    bilinear_load_dst \dst_fmt, \op, 2, d18, d19, q9

     vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS

     vmlsl.u16 q0, d2, d30

     vmlal.u16 q0, d3, d30

     vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS

     vmlsl.u16 q10, d22, d31

     vmlal.u16 q10, d23, d31

     vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)

     vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)

-    bilinear_duplicate_mask mask_fmt, 2, d4

+    bilinear_duplicate_mask \mask_fmt, 2, d4

     vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)

     vadd.u16  q12, q12, q13

     vmovn.u16 d0, q0

     bilinear_interleave_src_dst \

-                mask_fmt, op, 2, d0, d1, q0, d18, d19, q9

+                \mask_fmt, \op, 2, d0, d1, q0, d18, d19, q9

     bilinear_apply_mask_to_src \

-                mask_fmt, 2, d0, d1, q0, d4, \

+                \mask_fmt, 2, d0, d1, q0, d4, \

                 q3, q8, q10, q11

     bilinear_combine \

-                op, 2, d0, d1, q0, d18, d19, q9, \

+                \op, 2, d0, d1, q0, d18, d19, q9, \

                 q3, q8, q10, q11, d5

-    bilinear_deinterleave_dst mask_fmt, op, 2, d0, d1, q0

-    bilinear_store_&dst_fmt 2, q2, q3

+    bilinear_deinterleave_dst \mask_fmt, \op, 2, d0, d1, q0

+    bilinear_store_\()\dst_fmt 2, q2, q3

 .endm

 .macro bilinear_interpolate_four_pixels src_fmt, mask_fmt, dst_fmt, op

-    bilinear_load_and_vertical_interpolate_four_&src_fmt \

+    bilinear_load_and_vertical_interpolate_four_\()\src_fmt \

                 q1, q11, d0, d1, d20, d21, d22, d23 \

                 q3, q9,  d4, d5, d16, d17, d18, d19

     pld       [TMP1, PF_OFFS]

     sub       TMP1, TMP1, STRIDE

     vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS

     vmlsl.u16 q0, d2, d30

     vmlal.u16 q0, d3, d30

     vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS

     vmlsl.u16 q10, d22, d31

     vmlal.u16 q10, d23, d31

     vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)

     vshll.u16 q2, d6, #BILINEAR_INTERPOLATION_BITS

     vmlsl.u16 q2, d6, d30

     vmlal.u16 q2, d7, d30

     vshll.u16 q8, d18, #BILINEAR_INTERPOLATION_BITS

-    bilinear_load_mask mask_fmt, 4, d22

-    bilinear_load_dst dst_fmt, op, 4, d2, d3, q1

+    bilinear_load_mask \mask_fmt, 4, d22

+    bilinear_load_dst \dst_fmt, \op, 4, d2, d3, q1

     pld       [TMP1, PF_OFFS]

     vmlsl.u16 q8, d18, d31

     vmlal.u16 q8, d19, d31

     vadd.u16  q12, q12, q13

     vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)

     vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)

     vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)

     vshrn.u32 d5, q8, #(2 * BILINEAR_INTERPOLATION_BITS)

-    bilinear_duplicate_mask mask_fmt, 4, d22

+    bilinear_duplicate_mask \mask_fmt, 4, d22

     vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)

     vmovn.u16 d0, q0

     vmovn.u16 d1, q2

     vadd.u16  q12, q12, q13

     bilinear_interleave_src_dst \

-                mask_fmt, op, 4, d0, d1, q0, d2, d3, q1

+                \mask_fmt, \op, 4, d0, d1, q0, d2, d3, q1

     bilinear_apply_mask_to_src \

-                mask_fmt, 4, d0, d1, q0, d22, \

+                \mask_fmt, 4, d0, d1, q0, d22, \

                 q3, q8, q9, q10

     bilinear_combine \

-                op, 4, d0, d1, q0, d2, d3, q1, \

+                \op, 4, d0, d1, q0, d2, d3, q1, \

                 q3, q8, q9, q10, d23

-    bilinear_deinterleave_dst mask_fmt, op, 4, d0, d1, q0

-    bilinear_store_&dst_fmt 4, q2, q3

+    bilinear_deinterleave_dst \mask_fmt, \op, 4, d0, d1, q0

+    bilinear_store_\()\dst_fmt 4, q2, q3

 .endm

 .set BILINEAR_FLAG_USE_MASK,		1

 .set BILINEAR_FLAG_USE_ALL_NEON_REGS,	2

/*

  * Main template macro for generating NEON optimized bilinear scanline functions.

@@ -605,24 +605,24 @@

 	bilinear_process_four_pixels, \

 	bilinear_process_pixblock_head, \

 	bilinear_process_pixblock_tail, \

 	bilinear_process_pixblock_tail_head, \

 	pixblock_size, \

 	prefetch_distance, \

 	flags

-pixman_asm_function fname

-.if pixblock_size == 8

-.elseif pixblock_size == 4

+pixman_asm_function \fname

+.if \pixblock_size == 8

+.elseif \pixblock_size == 4

 .else

     .error unsupported pixblock size

 .endif

-.if ((flags) & BILINEAR_FLAG_USE_MASK) == 0

+.if ((\flags) & BILINEAR_FLAG_USE_MASK) == 0

     OUT       .req    r0

     TOP       .req    r1

     BOTTOM    .req    r2

     WT        .req    r3

     WB        .req    r4

     X         .req    r5

     UX        .req    r6

     WIDTH     .req    ip

@@ -630,17 +630,17 @@ pixman_asm_function fname

     TMP2      .req    r4

     PF_OFFS   .req    r7

     TMP3      .req    r8

     TMP4      .req    r9

     STRIDE    .req    r2

     mov		ip, sp

     push	{r4, r5, r6, r7, r8, r9}

-    mov		PF_OFFS, #prefetch_distance

+    mov		PF_OFFS, #\prefetch_distance

     ldmia	ip, {WB, X, UX, WIDTH}

 .else

     OUT       .req      r0

     MASK      .req      r1

     TOP       .req      r2

     BOTTOM    .req      r3

     WT        .req      r4

     WB        .req      r5

@@ -649,27 +649,27 @@ pixman_asm_function fname

     WIDTH     .req      ip

     TMP1      .req      r4

     TMP2      .req      r5

     PF_OFFS   .req      r8

     TMP3      .req      r9

     TMP4      .req      r10

     STRIDE    .req      r3

-    .set prefetch_offset, prefetch_distance

+    .set prefetch_offset, \prefetch_distance

     mov       ip, sp

     push      {r4, r5, r6, r7, r8, r9, r10, ip}

-    mov       PF_OFFS, #prefetch_distance

+    mov       PF_OFFS, #\prefetch_distance

     ldmia     ip, {WT, WB, X, UX, WIDTH}

 .endif

     mul       PF_OFFS, PF_OFFS, UX

-.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0

+.if ((\flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0

     vpush     {d8-d15}

 .endif

     sub	      STRIDE, BOTTOM, TOP

     .unreq    BOTTOM

     cmp       WIDTH, #0

     ble       3f

@@ -678,76 +678,76 @@ pixman_asm_function fname

     vdup.u16  q13, UX

     vdup.u8   d28, WT

     vdup.u8   d29, WB

     vadd.u16  d25, d25, d26

     /* ensure good destination alignment  */

     cmp       WIDTH, #1

     blt       0f

-    tst       OUT, #(1 << dst_bpp_shift)

+    tst       OUT, #(1 << \dst_bpp_shift)

     beq       0f

     vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)

     vadd.u16  q12, q12, q13

-    bilinear_process_last_pixel

+    \bilinear_process_last_pixel

     sub       WIDTH, WIDTH, #1

0:

     vadd.u16  q13, q13, q13

     vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)

     vadd.u16  q12, q12, q13

     cmp       WIDTH, #2

     blt       0f

-    tst       OUT, #(1 << (dst_bpp_shift + 1))

+    tst       OUT, #(1 << (\dst_bpp_shift + 1))

     beq       0f

-    bilinear_process_two_pixels

+    \bilinear_process_two_pixels

     sub       WIDTH, WIDTH, #2

0:

-.if pixblock_size == 8

+.if \pixblock_size == 8

     cmp       WIDTH, #4

     blt       0f

-    tst       OUT, #(1 << (dst_bpp_shift + 2))

+    tst       OUT, #(1 << (\dst_bpp_shift + 2))

     beq       0f

-    bilinear_process_four_pixels

+    \bilinear_process_four_pixels

     sub       WIDTH, WIDTH, #4

0:

 .endif

-    subs      WIDTH, WIDTH, #pixblock_size

+    subs      WIDTH, WIDTH, #\pixblock_size

     blt       1f

-    mov       PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)

-    bilinear_process_pixblock_head

-    subs      WIDTH, WIDTH, #pixblock_size

+    mov       PF_OFFS, PF_OFFS, asr #(16 - \src_bpp_shift)

+    \bilinear_process_pixblock_head

+    subs      WIDTH, WIDTH, #\pixblock_size

     blt       5f

0:

-    bilinear_process_pixblock_tail_head

-    subs      WIDTH, WIDTH, #pixblock_size

+    \bilinear_process_pixblock_tail_head

+    subs      WIDTH, WIDTH, #\pixblock_size

     bge       0b

5:

-    bilinear_process_pixblock_tail

+    \bilinear_process_pixblock_tail

1:

-.if pixblock_size == 8

+.if \pixblock_size == 8

     tst       WIDTH, #4

     beq       2f

-    bilinear_process_four_pixels

+    \bilinear_process_four_pixels

2:

 .endif

     /* handle the remaining trailing pixels */

     tst       WIDTH, #2

     beq       2f

-    bilinear_process_two_pixels

+    \bilinear_process_two_pixels

2:

     tst       WIDTH, #1

     beq       3f

-    bilinear_process_last_pixel

+    \bilinear_process_last_pixel

3:

-.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0

+.if ((\flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0

     vpop      {d8-d15}

 .endif

-.if ((flags) & BILINEAR_FLAG_USE_MASK) == 0

+.if ((\flags) & BILINEAR_FLAG_USE_MASK) == 0

     pop       {r4, r5, r6, r7, r8, r9}

 .else

     pop       {r4, r5, r6, r7, r8, r9, r10, ip}

 .endif

     bx        lr

     .unreq    OUT

     .unreq    TOP

@@ -757,21 +757,21 @@ 3:

     .unreq    UX

     .unreq    WIDTH

     .unreq    TMP1

     .unreq    TMP2

     .unreq    PF_OFFS

     .unreq    TMP3

     .unreq    TMP4

     .unreq    STRIDE

-.if ((flags) & BILINEAR_FLAG_USE_MASK) != 0

+.if ((\flags) & BILINEAR_FLAG_USE_MASK) != 0

     .unreq    MASK

 .endif

-.endfunc

+pixman_end_asm_function

 .endm

 /* src_8888_8_8888 */

 .macro bilinear_src_8888_8_8888_process_last_pixel

     bilinear_interpolate_last_pixel 8888, 8, 8888, src

 .endm

diff --git a/gfx/cairo/libpixman/src/pixman-arm-neon-asm.S b/gfx/cairo/libpixman/src/pixman-arm-neon-asm.S

--- a/gfx/cairo/libpixman/src/pixman-arm-neon-asm.S

+++ b/gfx/cairo/libpixman/src/pixman-arm-neon-asm.S

@@ -29,16 +29,22 @@

  * (those which are exposing some new or interesting features) are

  * extensively commented and can be used as examples.

  * You may want to have a look at the comments for following functions:

  *  - pixman_composite_over_8888_0565_asm_neon

  *  - pixman_composite_over_n_8_0565_asm_neon

*/

+#ifdef __clang__

+#define ldrgeb ldrbge

+#define subges subsge

+#define subpls subspl

+#endif

 /* Prevent the stack from becoming executable for no reason... */

 #if defined(__linux__) && defined(__ELF__)

 .section .note.GNU-stack,"",%progbits

 #endif

     .text

     .fpu neon

     .arch armv7a

@@ -255,43 +261,43 @@

         vqadd.u8    d16, d2, d20

     vld1.16     {d4, d5}, [DST_R, :128]!

         vqadd.u8    q9, q0, q11

     vshrn.u16   d6, q2, #8

     fetch_src_pixblock

     vshrn.u16   d7, q2, #3

     vsli.u16    q2, q2, #5

         vshll.u8    q14, d16, #8

-                                    PF add PF_X, PF_X, #8

+                                    PF add, PF_X, PF_X, #8

         vshll.u8    q8, d19, #8

-                                    PF tst PF_CTL, #0xF

+                                    PF tst, PF_CTL, #0xF

     vsri.u8     d6, d6, #5

-                                    PF addne PF_X, PF_X, #8

+                                    PF addne, PF_X, PF_X, #8

     vmvn.8      d3, d3

-                                    PF subne PF_CTL, PF_CTL, #1

+                                    PF subne, PF_CTL, PF_CTL, #1

     vsri.u8     d7, d7, #6

     vshrn.u16   d30, q2, #2

     vmull.u8    q10, d3, d6

                                     PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]

     vmull.u8    q11, d3, d7

     vmull.u8    q12, d3, d30

                                     PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]

         vsri.u16    q14, q8, #5

-                                    PF cmp PF_X, ORIG_W

+                                    PF cmp, PF_X, ORIG_W

         vshll.u8    q9, d18, #8

     vrshr.u16   q13, q10, #8

-                                    PF subge PF_X, PF_X, ORIG_W

+                                    PF subge, PF_X, PF_X, ORIG_W

     vrshr.u16   q3, q11, #8

     vrshr.u16   q15, q12, #8

-                                    PF subges PF_CTL, PF_CTL, #0x10

+                                    PF subges, PF_CTL, PF_CTL, #0x10

         vsri.u16    q14, q9, #11

-                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!

+                                    PF ldrgeb, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!

     vraddhn.u16 d20, q10, q13

     vraddhn.u16 d23, q11, q3

-                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!

+                                    PF ldrgeb, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!

     vraddhn.u16 d22, q12, q15

         vst1.16     {d28, d29}, [DST_W, :128]!

 .endm

 #else

 /* If we did not care much about the performance, we would just use this... */

 .macro pixman_composite_over_8888_0565_process_pixblock_tail_head

@@ -429,30 +435,30 @@ generate_composite_function \

 .macro pixman_composite_src_8888_0565_process_pixblock_tail

     vsri.u16    q14, q8, #5

     vsri.u16    q14, q9, #11

 .endm

 .macro pixman_composite_src_8888_0565_process_pixblock_tail_head

         vsri.u16    q14, q8, #5

-                                    PF add PF_X, PF_X, #8

-                                    PF tst PF_CTL, #0xF

+                                    PF add, PF_X, PF_X, #8

+                                    PF tst, PF_CTL, #0xF

     fetch_src_pixblock

-                                    PF addne PF_X, PF_X, #8

-                                    PF subne PF_CTL, PF_CTL, #1

+                                    PF addne, PF_X, PF_X, #8

+                                    PF subne, PF_CTL, PF_CTL, #1

         vsri.u16    q14, q9, #11

-                                    PF cmp PF_X, ORIG_W

+                                    PF cmp, PF_X, ORIG_W

                                     PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]

     vshll.u8    q8, d1, #8

         vst1.16     {d28, d29}, [DST_W, :128]!

-                                    PF subge PF_X, PF_X, ORIG_W

-                                    PF subges PF_CTL, PF_CTL, #0x10

+                                    PF subge, PF_X, PF_X, ORIG_W

+                                    PF subges, PF_CTL, PF_CTL, #0x10

     vshll.u8    q14, d2, #8

-                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!

+                                    PF ldrgeb, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!

     vshll.u8    q9, d0, #8

 .endm

 generate_composite_function \

     pixman_composite_src_8888_0565_asm_neon, 32, 0, 16, \

     FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \

     8, /* number of pixels, processed in a single block */ \

     10, /* prefetch distance */ \

@@ -504,30 +510,30 @@ generate_composite_function \

     vqadd.u8    q15, q1, q3

 .endm

 .macro pixman_composite_add_8_8_process_pixblock_tail

 .endm

 .macro pixman_composite_add_8_8_process_pixblock_tail_head

     fetch_src_pixblock

-                                    PF add PF_X, PF_X, #32

-                                    PF tst PF_CTL, #0xF

+                                    PF add, PF_X, PF_X, #32

+                                    PF tst, PF_CTL, #0xF

     vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!

-                                    PF addne PF_X, PF_X, #32

-                                    PF subne PF_CTL, PF_CTL, #1

+                                    PF addne, PF_X, PF_X, #32

+                                    PF subne, PF_CTL, PF_CTL, #1

         vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!

-                                    PF cmp PF_X, ORIG_W

+                                    PF cmp, PF_X, ORIG_W

                                     PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]

                                     PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]

-                                    PF subge PF_X, PF_X, ORIG_W

-                                    PF subges PF_CTL, PF_CTL, #0x10

+                                    PF subge, PF_X, PF_X, ORIG_W

+                                    PF subges, PF_CTL, PF_CTL, #0x10

     vqadd.u8    q14, q0, q2

-                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!

-                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!

+                                    PF ldrgeb, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!

+                                    PF ldrgeb, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!

     vqadd.u8    q15, q1, q3

 .endm

 generate_composite_function \

     pixman_composite_add_8_8_asm_neon, 8, 0, 8, \

     FLAG_DST_READWRITE, \

     32, /* number of pixels, processed in a single block */ \

     10, /* prefetch distance */ \

@@ -536,30 +542,30 @@ generate_composite_function \

     pixman_composite_add_8_8_process_pixblock_head, \

     pixman_composite_add_8_8_process_pixblock_tail, \

     pixman_composite_add_8_8_process_pixblock_tail_head

 /******************************************************************************/

 .macro pixman_composite_add_8888_8888_process_pixblock_tail_head

     fetch_src_pixblock

-                                    PF add PF_X, PF_X, #8

-                                    PF tst PF_CTL, #0xF

+                                    PF add, PF_X, PF_X, #8

+                                    PF tst, PF_CTL, #0xF

     vld1.32     {d4, d5, d6, d7}, [DST_R, :128]!

-                                    PF addne PF_X, PF_X, #8

-                                    PF subne PF_CTL, PF_CTL, #1

+                                    PF addne, PF_X, PF_X, #8

+                                    PF subne, PF_CTL, PF_CTL, #1

         vst1.32     {d28, d29, d30, d31}, [DST_W, :128]!

-                                    PF cmp PF_X, ORIG_W

+                                    PF cmp, PF_X, ORIG_W

                                     PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]

                                     PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]

-                                    PF subge PF_X, PF_X, ORIG_W

-                                    PF subges PF_CTL, PF_CTL, #0x10

+                                    PF subge, PF_X, PF_X, ORIG_W

+                                    PF subges, PF_CTL, PF_CTL, #0x10

     vqadd.u8    q14, q0, q2

-                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!

-                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!

+                                    PF ldrgeb, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!

+                                    PF ldrgeb, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!

     vqadd.u8    q15, q1, q3

 .endm

 generate_composite_function \

     pixman_composite_add_8888_8888_asm_neon, 32, 0, 32, \

     FLAG_DST_READWRITE, \

     8, /* number of pixels, processed in a single block */ \

     10, /* prefetch distance */ \

@@ -599,40 +605,40 @@ generate_composite_function_single_scanl

     vraddhn.u16 d29, q15, q9

     vraddhn.u16 d30, q12, q10

     vraddhn.u16 d31, q13, q11

 .endm

 .macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head

     vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!

         vrshr.u16   q14, q8, #8

-                                    PF add PF_X, PF_X, #8

-                                    PF tst PF_CTL, #0xF

+                                    PF add, PF_X, PF_X, #8

+                                    PF tst, PF_CTL, #0xF

         vrshr.u16   q15, q9, #8

         vrshr.u16   q12, q10, #8

         vrshr.u16   q13, q11, #8

-                                    PF addne PF_X, PF_X, #8

-                                    PF subne PF_CTL, PF_CTL, #1

+                                    PF addne, PF_X, PF_X, #8

+                                    PF subne, PF_CTL, PF_CTL, #1

         vraddhn.u16 d28, q14, q8

         vraddhn.u16 d29, q15, q9

-                                    PF cmp PF_X, ORIG_W

+                                    PF cmp, PF_X, ORIG_W

         vraddhn.u16 d30, q12, q10

         vraddhn.u16 d31, q13, q11

     fetch_src_pixblock

                                     PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]

     vmvn.8      d22, d3

                                     PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]

         vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!

-                                    PF subge PF_X, PF_X, ORIG_W

+                                    PF subge, PF_X, PF_X, ORIG_W

     vmull.u8    q8, d22, d4

-                                    PF subges PF_CTL, PF_CTL, #0x10

+                                    PF subges, PF_CTL, PF_CTL, #0x10

     vmull.u8    q9, d22, d5

-                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!

+                                    PF ldrgeb, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!

     vmull.u8    q10, d22, d6

-                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!

+                                    PF ldrgeb, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!

     vmull.u8    q11, d22, d7

 .endm

 generate_composite_function_single_scanline \

     pixman_composite_scanline_out_reverse_asm_neon, 32, 0, 32, \

     FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \

     8, /* number of pixels, processed in a single block */ \

     default_init, \

@@ -651,42 +657,42 @@ generate_composite_function_single_scanl

     pixman_composite_out_reverse_8888_8888_process_pixblock_tail

     vqadd.u8    q14, q0, q14

     vqadd.u8    q15, q1, q15

 .endm

 .macro pixman_composite_over_8888_8888_process_pixblock_tail_head

     vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!

         vrshr.u16   q14, q8, #8

-                                    PF add PF_X, PF_X, #8

-                                    PF tst PF_CTL, #0xF

+                                    PF add, PF_X, PF_X, #8

+                                    PF tst, PF_CTL, #0xF

         vrshr.u16   q15, q9, #8

         vrshr.u16   q12, q10, #8

         vrshr.u16   q13, q11, #8

-                                    PF addne PF_X, PF_X, #8

-                                    PF subne PF_CTL, PF_CTL, #1

+                                    PF addne, PF_X, PF_X, #8

+                                    PF subne, PF_CTL, PF_CTL, #1

         vraddhn.u16 d28, q14, q8

         vraddhn.u16 d29, q15, q9

-                                    PF cmp PF_X, ORIG_W

+                                    PF cmp, PF_X, ORIG_W

         vraddhn.u16 d30, q12, q10

         vraddhn.u16 d31, q13, q11

         vqadd.u8    q14, q0, q14

         vqadd.u8    q15, q1, q15

     fetch_src_pixblock

                                     PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]

     vmvn.8      d22, d3

                                     PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]

         vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!

-                                    PF subge PF_X, PF_X, ORIG_W

+                                    PF subge, PF_X, PF_X, ORIG_W

     vmull.u8    q8, d22, d4

-                                    PF subges PF_CTL, PF_CTL, #0x10

+                                    PF subges, PF_CTL, PF_CTL, #0x10

     vmull.u8    q9, d22, d5

-                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!

+                                    PF ldrgeb, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!

     vmull.u8    q10, d22, d6

-                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!

+                                    PF ldrgeb, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!

     vmull.u8    q11, d22, d7

 .endm

 generate_composite_function \

     pixman_composite_over_8888_8888_asm_neon, 32, 0, 32, \

     FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \

     8, /* number of pixels, processed in a single block */ \

     5, /* prefetch distance */ \

@@ -737,30 +743,30 @@ generate_composite_function_single_scanl

         vrshr.u16   q2, q10, #8

         vrshr.u16   q3, q11, #8

         vraddhn.u16 d28, q14, q8

         vraddhn.u16 d29, q15, q9

         vraddhn.u16 d30, q2, q10

         vraddhn.u16 d31, q3, q11

     vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!

         vqadd.u8    q14, q0, q14

-                                    PF add PF_X, PF_X, #8

-                                    PF tst PF_CTL, #0x0F

-                                    PF addne PF_X, PF_X, #8

-                                    PF subne PF_CTL, PF_CTL, #1

+                                    PF add, PF_X, PF_X, #8

+                                    PF tst, PF_CTL, #0x0F

+                                    PF addne, PF_X, PF_X, #8

+                                    PF subne, PF_CTL, PF_CTL, #1

         vqadd.u8    q15, q1, q15

-                                    PF cmp PF_X, ORIG_W

+                                    PF cmp, PF_X, ORIG_W

     vmull.u8    q8, d24, d4

                                     PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]

     vmull.u8    q9, d24, d5

-                                    PF subge PF_X, PF_X, ORIG_W

+                                    PF subge, PF_X, PF_X, ORIG_W

     vmull.u8    q10, d24, d6

-                                    PF subges PF_CTL, PF_CTL, #0x10

+                                    PF subges, PF_CTL, PF_CTL, #0x10

     vmull.u8    q11, d24, d7

-                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!

+                                    PF ldrgeb, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!

         vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!

 .endm

 .macro pixman_composite_over_n_8888_init

     add         DUMMY, sp, #ARGS_STACK_OFFSET

     vld1.32     {d3[0]}, [DUMMY]

     vdup.8      d0, d3[0]

     vdup.8      d1, d3[1]

@@ -779,40 +785,40 @@ generate_composite_function \

     pixman_composite_over_8888_8888_process_pixblock_head, \

     pixman_composite_over_8888_8888_process_pixblock_tail, \

     pixman_composite_over_n_8888_process_pixblock_tail_head

 /******************************************************************************/

 .macro pixman_composite_over_reverse_n_8888_process_pixblock_tail_head

         vrshr.u16   q14, q8, #8

-                                    PF add PF_X, PF_X, #8

-                                    PF tst PF_CTL, #0xF

+                                    PF add, PF_X, PF_X, #8

+                                    PF tst, PF_CTL, #0xF

         vrshr.u16   q15, q9, #8

         vrshr.u16   q12, q10, #8

         vrshr.u16   q13, q11, #8

-                                    PF addne PF_X, PF_X, #8

-                                    PF subne PF_CTL, PF_CTL, #1

+                                    PF addne, PF_X, PF_X, #8

+                                    PF subne, PF_CTL, PF_CTL, #1

         vraddhn.u16 d28, q14, q8

         vraddhn.u16 d29, q15, q9

-                                    PF cmp PF_X, ORIG_W

+                                    PF cmp, PF_X, ORIG_W

         vraddhn.u16 d30, q12, q10

         vraddhn.u16 d31, q13, q11

         vqadd.u8    q14, q0, q14

         vqadd.u8    q15, q1, q15

     vld4.8      {d0, d1, d2, d3}, [DST_R, :128]!

     vmvn.8      d22, d3

                                     PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]

         vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!

-                                    PF subge PF_X, PF_X, ORIG_W

+                                    PF subge, PF_X, PF_X, ORIG_W

     vmull.u8    q8, d22, d4

-                                    PF subges PF_CTL, PF_CTL, #0x10

+                                    PF subges, PF_CTL, PF_CTL, #0x10

     vmull.u8    q9, d22, d5

     vmull.u8    q10, d22, d6

-                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!

+                                    PF ldrgeb, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!

     vmull.u8    q11, d22, d7

 .endm

 .macro pixman_composite_over_reverse_n_8888_init

     add         DUMMY, sp, #ARGS_STACK_OFFSET

     vld1.32     {d7[0]}, [DUMMY]

     vdup.8      d4, d7[0]

     vdup.8      d5, d7[1]

@@ -1240,33 +1246,33 @@ generate_composite_function \

     vrshrn.u16  d28, q8, #8

     vrshrn.u16  d29, q9, #8

     vrshrn.u16  d30, q10, #8

     vrshrn.u16  d31, q11, #8

 .endm

 .macro pixman_composite_src_n_8_8888_process_pixblock_tail_head

     fetch_mask_pixblock

-                                    PF add PF_X, PF_X, #8

+                                    PF add, PF_X, PF_X, #8

         vrshrn.u16  d28, q8, #8

-                                    PF tst PF_CTL, #0x0F

+                                    PF tst, PF_CTL, #0x0F

         vrshrn.u16  d29, q9, #8

-                                    PF addne PF_X, PF_X, #8

+                                    PF addne, PF_X, PF_X, #8

         vrshrn.u16  d30, q10, #8

-                                    PF subne PF_CTL, PF_CTL, #1

+                                    PF subne, PF_CTL, PF_CTL, #1

         vrshrn.u16  d31, q11, #8

-                                    PF cmp PF_X, ORIG_W

+                                    PF cmp, PF_X, ORIG_W

     vmull.u8    q8, d24, d0

                                     PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]

     vmull.u8    q9, d24, d1

-                                    PF subge PF_X, PF_X, ORIG_W

+                                    PF subge, PF_X, PF_X, ORIG_W

     vmull.u8    q10, d24, d2

-                                    PF subges PF_CTL, PF_CTL, #0x10

+                                    PF subges, PF_CTL, PF_CTL, #0x10

     vmull.u8    q11, d24, d3

-                                    PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!

+                                    PF ldrgeb, DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!

         vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!

     vrsra.u16   q8, q8, #8

     vrsra.u16   q9, q9, #8

     vrsra.u16   q10, q10, #8

     vrsra.u16   q11, q11, #8

 .endm

 .macro pixman_composite_src_n_8_8888_init

@@ -1309,33 +1315,33 @@ generate_composite_function \

     vrshrn.u16  d28, q0, #8

     vrshrn.u16  d29, q1, #8

     vrshrn.u16  d30, q2, #8

     vrshrn.u16  d31, q3, #8

 .endm

 .macro pixman_composite_src_n_8_8_process_pixblock_tail_head

     fetch_mask_pixblock

-                                    PF add PF_X, PF_X, #8

+                                    PF add, PF_X, PF_X, #8

         vrshrn.u16  d28, q0, #8

-                                    PF tst PF_CTL, #0x0F

+                                    PF tst, PF_CTL, #0x0F

         vrshrn.u16  d29, q1, #8

-                                    PF addne PF_X, PF_X, #8

+                                    PF addne, PF_X, PF_X, #8

         vrshrn.u16  d30, q2, #8

-                                    PF subne PF_CTL, PF_CTL, #1

+                                    PF subne, PF_CTL, PF_CTL, #1

         vrshrn.u16  d31, q3, #8

-                                    PF cmp PF_X, ORIG_W

+                                    PF cmp, PF_X, ORIG_W

     vmull.u8    q0,  d24, d16

                                     PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]

     vmull.u8    q1,  d25, d16

-                                    PF subge PF_X, PF_X, ORIG_W

+                                    PF subge, PF_X, PF_X, ORIG_W

     vmull.u8    q2,  d26, d16

-                                    PF subges PF_CTL, PF_CTL, #0x10

+                                    PF subges, PF_CTL, PF_CTL, #0x10

     vmull.u8    q3,  d27, d16

-                                    PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!

+                                    PF ldrgeb, DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!

         vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!

     vrsra.u16   q0, q0,  #8

     vrsra.u16   q1, q1,  #8

     vrsra.u16   q2, q2,  #8

     vrsra.u16   q3, q3,  #8

 .endm

 .macro pixman_composite_src_n_8_8_init

@@ -1403,37 +1409,37 @@ generate_composite_function \

 .endm

 .macro pixman_composite_over_n_8_8888_process_pixblock_tail_head

         vrshr.u16   q14, q8, #8

     vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!

         vrshr.u16   q15, q9, #8

     fetch_mask_pixblock

         vrshr.u16   q6, q10, #8

-                                    PF add PF_X, PF_X, #8

+                                    PF add, PF_X, PF_X, #8

         vrshr.u16   q7, q11, #8

-                                    PF tst PF_CTL, #0x0F

+                                    PF tst, PF_CTL, #0x0F

         vraddhn.u16 d28, q14, q8

-                                    PF addne PF_X, PF_X, #8

+                                    PF addne, PF_X, PF_X, #8

         vraddhn.u16 d29, q15, q9

-                                    PF subne PF_CTL, PF_CTL, #1

+                                    PF subne, PF_CTL, PF_CTL, #1

         vraddhn.u16 d30, q6, q10

-                                    PF cmp PF_X, ORIG_W

+                                    PF cmp, PF_X, ORIG_W

         vraddhn.u16 d31, q7, q11

                                     PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]

     vmull.u8    q6, d24, d8

                                     PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]

     vmull.u8    q7, d24, d9

-                                    PF subge PF_X, PF_X, ORIG_W

+                                    PF subge, PF_X, PF_X, ORIG_W

     vmull.u8    q8, d24, d10

-                                    PF subges PF_CTL, PF_CTL, #0x10

+                                    PF subges, PF_CTL, PF_CTL, #0x10

     vmull.u8    q9, d24, d11

-                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!

+                                    PF ldrgeb, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!

         vqadd.u8    q14, q0, q14

-                                    PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!

+                                    PF ldrgeb, DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!

         vqadd.u8    q15, q1, q15

     vrshr.u16   q10, q6, #8

     vrshr.u16   q11, q7, #8

     vrshr.u16   q12, q8, #8

     vrshr.u16   q13, q9, #8

     vraddhn.u16 d0, q6, q10

     vraddhn.u16 d1, q7, q11

     vraddhn.u16 d2, q8, q12

@@ -2420,31 +2426,31 @@ generate_composite_function \

 .macro pixman_composite_src_pixbuf_8888_process_pixblock_tail_head

         vrshr.u16   q11, q8, #8

         vswp        d3, d31

         vrshr.u16   q12, q9, #8

         vrshr.u16   q13, q10, #8

     fetch_src_pixblock

         vraddhn.u16 d30, q11, q8

-                                    PF add PF_X, PF_X, #8

-                                    PF tst PF_CTL, #0xF

-                                    PF addne PF_X, PF_X, #8

-                                    PF subne PF_CTL, PF_CTL, #1

+                                    PF add, PF_X, PF_X, #8

+                                    PF tst, PF_CTL, #0xF

+                                    PF addne, PF_X, PF_X, #8

+                                    PF subne, PF_CTL, PF_CTL, #1

         vraddhn.u16 d29, q12, q9

         vraddhn.u16 d28, q13, q10

     vmull.u8    q8, d3, d0

     vmull.u8    q9, d3, d1

     vmull.u8    q10, d3, d2

         vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!

-                                    PF cmp PF_X, ORIG_W

+                                    PF cmp, PF_X, ORIG_W

                                     PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]

-                                    PF subge PF_X, PF_X, ORIG_W

-                                    PF subges PF_CTL, PF_CTL, #0x10

-                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!

+                                    PF subge, PF_X, PF_X, ORIG_W

+                                    PF subges, PF_CTL, PF_CTL, #0x10

+                                    PF ldrgeb, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!

 .endm

 generate_composite_function \

     pixman_composite_src_pixbuf_8888_asm_neon, 32, 0, 32, \

     FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \

     8, /* number of pixels, processed in a single block */ \

     10, /* prefetch distance */ \

     default_init, \

@@ -2477,31 +2483,31 @@ generate_composite_function \

 .macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head

         vrshr.u16   q11, q8, #8

         vswp        d3, d31

         vrshr.u16   q12, q9, #8

         vrshr.u16   q13, q10, #8

     fetch_src_pixblock

         vraddhn.u16 d28, q11, q8

-                                    PF add PF_X, PF_X, #8

-                                    PF tst PF_CTL, #0xF

-                                    PF addne PF_X, PF_X, #8

-                                    PF subne PF_CTL, PF_CTL, #1

+                                    PF add, PF_X, PF_X, #8

+                                    PF tst, PF_CTL, #0xF

+                                    PF addne, PF_X, PF_X, #8

+                                    PF subne, PF_CTL, PF_CTL, #1

         vraddhn.u16 d29, q12, q9

         vraddhn.u16 d30, q13, q10

     vmull.u8    q8, d3, d0

     vmull.u8    q9, d3, d1

     vmull.u8    q10, d3, d2

         vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!

-                                    PF cmp PF_X, ORIG_W

+                                    PF cmp, PF_X, ORIG_W

                                     PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]

-                                    PF subge PF_X, PF_X, ORIG_W

-                                    PF subges PF_CTL, PF_CTL, #0x10

-                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!

+                                    PF subge, PF_X, PF_X, ORIG_W

+                                    PF subges, PF_CTL, PF_CTL, #0x10

+                                    PF ldrgeb, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!

 .endm

 generate_composite_function \

     pixman_composite_src_rpixbuf_8888_asm_neon, 32, 0, 32, \

     FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \

     8, /* number of pixels, processed in a single block */ \

     10, /* prefetch distance */ \

     default_init, \

@@ -2836,182 +2842,182 @@ generate_composite_function_nearest_scan

  * format conversion, and interpolation as separate macros which can be used

  * as the basic building blocks for constructing bilinear scanline functions.

*/

 .macro bilinear_load_8888 reg1, reg2, tmp

     mov       TMP1, X, asr #16

     add       X, X, UX

     add       TMP1, TOP, TMP1, asl #2

-    vld1.32   {reg1}, [TMP1], STRIDE

-    vld1.32   {reg2}, [TMP1]

+    vld1.32   {\reg1}, [TMP1], STRIDE

+    vld1.32   {\reg2}, [TMP1]

 .endm

 .macro bilinear_load_0565 reg1, reg2, tmp

     mov       TMP1, X, asr #16

     add       X, X, UX

     add       TMP1, TOP, TMP1, asl #1

-    vld1.32   {reg2[0]}, [TMP1], STRIDE

-    vld1.32   {reg2[1]}, [TMP1]

-    convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp

+    vld1.32   {\reg2[0]}, [TMP1], STRIDE

+    vld1.32   {\reg2[1]}, [TMP1]

+    convert_four_0565_to_x888_packed \reg2, \reg1, \reg2, \tmp

 .endm

 .macro bilinear_load_and_vertical_interpolate_two_8888 \

                     acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2

-    bilinear_load_8888 reg1, reg2, tmp1

-    vmull.u8  acc1, reg1, d28

-    vmlal.u8  acc1, reg2, d29

-    bilinear_load_8888 reg3, reg4, tmp2

-    vmull.u8  acc2, reg3, d28

-    vmlal.u8  acc2, reg4, d29

+    bilinear_load_8888 \reg1, \reg2, \tmp1

+    vmull.u8  \acc1, \reg1, d28

+    vmlal.u8  \acc1, \reg2, d29

+    bilinear_load_8888 \reg3, \reg4, \tmp2

+    vmull.u8  \acc2, \reg3, d28

+    vmlal.u8  \acc2, \reg4, d29

 .endm

 .macro bilinear_load_and_vertical_interpolate_four_8888 \

                 xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \

                 yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi

     bilinear_load_and_vertical_interpolate_two_8888 \

-                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi

+                \xacc1, \xacc2, \xreg1, \xreg2, \xreg3, \xreg4, \xacc2lo, \xacc2hi

     bilinear_load_and_vertical_interpolate_two_8888 \

-                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi

+                \yacc1, \yacc2, \yreg1, \yreg2, \yreg3, \yreg4, \yacc2lo, \yacc2hi

 .endm

 .macro bilinear_load_and_vertical_interpolate_two_0565 \

                 acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi

     mov       TMP1, X, asr #16

     add       X, X, UX

     add       TMP1, TOP, TMP1, asl #1

     mov       TMP2, X, asr #16

     add       X, X, UX

     add       TMP2, TOP, TMP2, asl #1

-    vld1.32   {acc2lo[0]}, [TMP1], STRIDE

-    vld1.32   {acc2hi[0]}, [TMP2], STRIDE

-    vld1.32   {acc2lo[1]}, [TMP1]

-    vld1.32   {acc2hi[1]}, [TMP2]

-    convert_0565_to_x888 acc2, reg3, reg2, reg1

-    vzip.u8   reg1, reg3

-    vzip.u8   reg2, reg4

-    vzip.u8   reg3, reg4

-    vzip.u8   reg1, reg2

-    vmull.u8  acc1, reg1, d28

-    vmlal.u8  acc1, reg2, d29

-    vmull.u8  acc2, reg3, d28

-    vmlal.u8  acc2, reg4, d29

+    vld1.32   {\acc2lo[0]}, [TMP1], STRIDE

+    vld1.32   {\acc2hi[0]}, [TMP2], STRIDE

+    vld1.32   {\acc2lo[1]}, [TMP1]

+    vld1.32   {\acc2hi[1]}, [TMP2]

+    convert_0565_to_x888 \acc2, \reg3, \reg2, \reg1

+    vzip.u8   \reg1, \reg3

+    vzip.u8   \reg2, \reg4

+    vzip.u8   \reg3, \reg4

+    vzip.u8   \reg1, \reg2

+    vmull.u8  \acc1, \reg1, d28

+    vmlal.u8  \acc1, \reg2, d29

+    vmull.u8  \acc2, \reg3, d28

+    vmlal.u8  \acc2, \reg4, d29

 .endm

 .macro bilinear_load_and_vertical_interpolate_four_0565 \

                 xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \

                 yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi

     mov       TMP1, X, asr #16

     add       X, X, UX

     add       TMP1, TOP, TMP1, asl #1

     mov       TMP2, X, asr #16

     add       X, X, UX

     add       TMP2, TOP, TMP2, asl #1

-    vld1.32   {xacc2lo[0]}, [TMP1], STRIDE

-    vld1.32   {xacc2hi[0]}, [TMP2], STRIDE

-    vld1.32   {xacc2lo[1]}, [TMP1]

-    vld1.32   {xacc2hi[1]}, [TMP2]

-    convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1

+    vld1.32   {\xacc2lo[0]}, [TMP1], STRIDE

+    vld1.32   {\xacc2hi[0]}, [TMP2], STRIDE

+    vld1.32   {\xacc2lo[1]}, [TMP1]

+    vld1.32   {\xacc2hi[1]}, [TMP2]

+    convert_0565_to_x888 \xacc2, \xreg3, \xreg2, \xreg1

     mov       TMP1, X, asr #16

     add       X, X, UX

     add       TMP1, TOP, TMP1, asl #1

     mov       TMP2, X, asr #16

     add       X, X, UX

     add       TMP2, TOP, TMP2, asl #1

-    vld1.32   {yacc2lo[0]}, [TMP1], STRIDE

-    vzip.u8   xreg1, xreg3

-    vld1.32   {yacc2hi[0]}, [TMP2], STRIDE

-    vzip.u8   xreg2, xreg4

-    vld1.32   {yacc2lo[1]}, [TMP1]

-    vzip.u8   xreg3, xreg4

-    vld1.32   {yacc2hi[1]}, [TMP2]

-    vzip.u8   xreg1, xreg2

-    convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1

-    vmull.u8  xacc1, xreg1, d28

-    vzip.u8   yreg1, yreg3

-    vmlal.u8  xacc1, xreg2, d29

-    vzip.u8   yreg2, yreg4

-    vmull.u8  xacc2, xreg3, d28

-    vzip.u8   yreg3, yreg4

-    vmlal.u8  xacc2, xreg4, d29

-    vzip.u8   yreg1, yreg2

-    vmull.u8  yacc1, yreg1, d28

-    vmlal.u8  yacc1, yreg2, d29

-    vmull.u8  yacc2, yreg3, d28

-    vmlal.u8  yacc2, yreg4, d29

+    vld1.32   {\yacc2lo[0]}, [TMP1], STRIDE

+    vzip.u8   \xreg1, \xreg3

+    vld1.32   {\yacc2hi[0]}, [TMP2], STRIDE

+    vzip.u8   \xreg2, \xreg4

+    vld1.32   {\yacc2lo[1]}, [TMP1]

+    vzip.u8   \xreg3, \xreg4

+    vld1.32   {\yacc2hi[1]}, [TMP2]

+    vzip.u8   \xreg1, \xreg2

+    convert_0565_to_x888 \yacc2, \yreg3, \yreg2, \yreg1

+    vmull.u8  \xacc1, \xreg1, d28

+    vzip.u8   \yreg1, \yreg3

+    vmlal.u8  \xacc1, \xreg2, d29

+    vzip.u8   \yreg2, \yreg4

+    vmull.u8  \xacc2, \xreg3, d28

+    vzip.u8   \yreg3, \yreg4

+    vmlal.u8  \xacc2, \xreg4, d29

+    vzip.u8   \yreg1, \yreg2

+    vmull.u8  \yacc1, \yreg1, d28

+    vmlal.u8  \yacc1, \yreg2, d29

+    vmull.u8  \yacc2, \yreg3, d28

+    vmlal.u8  \yacc2, \yreg4, d29

 .endm

 .macro bilinear_store_8888 numpix, tmp1, tmp2

-.if numpix == 4

+.if \numpix == 4

     vst1.32   {d0, d1}, [OUT, :128]!

-.elseif numpix == 2

+.elseif \numpix == 2

     vst1.32   {d0}, [OUT, :64]!

-.elseif numpix == 1

+.elseif \numpix == 1

     vst1.32   {d0[0]}, [OUT, :32]!

 .else

-    .error bilinear_store_8888 numpix is unsupported

+    .error bilinear_store_8888 \numpix is unsupported

 .endif

 .endm

 .macro bilinear_store_0565 numpix, tmp1, tmp2

     vuzp.u8 d0, d1

     vuzp.u8 d2, d3

     vuzp.u8 d1, d3

     vuzp.u8 d0, d2

-    convert_8888_to_0565 d2, d1, d0, q1, tmp1, tmp2

-.if numpix == 4

+    convert_8888_to_0565 d2, d1, d0, q1, \tmp1, \tmp2

+.if \numpix == 4

     vst1.16   {d2}, [OUT, :64]!

-.elseif numpix == 2

+.elseif \numpix == 2

     vst1.32   {d2[0]}, [OUT, :32]!

-.elseif numpix == 1

+.elseif \numpix == 1

     vst1.16   {d2[0]}, [OUT, :16]!

 .else

-    .error bilinear_store_0565 numpix is unsupported

+    .error bilinear_store_0565 \numpix is unsupported

 .endif

 .endm

 .macro bilinear_interpolate_last_pixel src_fmt, dst_fmt

-    bilinear_load_&src_fmt d0, d1, d2

+    bilinear_load_\()\src_fmt d0, d1, d2

     vmull.u8  q1, d0, d28

     vmlal.u8  q1, d1, d29

     /* 5 cycles bubble */

     vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS

     vmlsl.u16 q0, d2, d30

     vmlal.u16 q0, d3, d30

     /* 5 cycles bubble */

     vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)

     /* 3 cycles bubble */

     vmovn.u16 d0, q0

     /* 1 cycle bubble */

-    bilinear_store_&dst_fmt 1, q2, q3

+    bilinear_store_\()\dst_fmt 1, q2, q3

 .endm

 .macro bilinear_interpolate_two_pixels src_fmt, dst_fmt

-    bilinear_load_and_vertical_interpolate_two_&src_fmt \

+    bilinear_load_and_vertical_interpolate_two_\()\src_fmt \

                 q1, q11, d0, d1, d20, d21, d22, d23

     vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS

     vmlsl.u16 q0, d2, d30

     vmlal.u16 q0, d3, d30

     vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS

     vmlsl.u16 q10, d22, d31

     vmlal.u16 q10, d23, d31

     vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)

     vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)

     vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)

     vadd.u16  q12, q12, q13

     vmovn.u16 d0, q0

-    bilinear_store_&dst_fmt 2, q2, q3

+    bilinear_store_\()\dst_fmt 2, q2, q3

 .endm

 .macro bilinear_interpolate_four_pixels src_fmt, dst_fmt

-    bilinear_load_and_vertical_interpolate_four_&src_fmt \

+    bilinear_load_and_vertical_interpolate_four_\()\src_fmt \

                 q1, q11, d0, d1, d20, d21, d22, d23 \

                 q3, q9,  d4, d5, d16, d17, d18, d19

     pld       [TMP1, PF_OFFS]

     sub       TMP1, TMP1, STRIDE

     vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS

     vmlsl.u16 q0, d2, d30

     vmlal.u16 q0, d3, d30

     vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS

@@ -3029,64 +3035,64 @@ generate_composite_function_nearest_scan

     vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)

     vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)

     vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)

     vshrn.u32 d5, q8, #(2 * BILINEAR_INTERPOLATION_BITS)

     vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)

     vmovn.u16 d0, q0

     vmovn.u16 d1, q2

     vadd.u16  q12, q12, q13

-    bilinear_store_&dst_fmt 4, q2, q3

+    bilinear_store_\()\dst_fmt 4, q2, q3

 .endm

 .macro bilinear_interpolate_four_pixels_head src_fmt, dst_fmt

-.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt

-    bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_head

+.ifdef have_bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt

+    bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt\()_head

 .else

-    bilinear_interpolate_four_pixels src_fmt, dst_fmt

+    bilinear_interpolate_four_pixels \src_fmt, \dst_fmt

 .endif

 .endm

 .macro bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt

-.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt

-    bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail

+.ifdef have_bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt

+    bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail

 .endif

 .endm

 .macro bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt

-.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt

-    bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail_head

+.ifdef have_bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt

+    bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail_head

 .else

-    bilinear_interpolate_four_pixels src_fmt, dst_fmt

+    bilinear_interpolate_four_pixels \src_fmt, \dst_fmt

 .endif

 .endm

 .macro bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt

-.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt

-    bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_head

+.ifdef have_bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt

+    bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt\()_head

 .else

-    bilinear_interpolate_four_pixels_head src_fmt, dst_fmt

-    bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt

+    bilinear_interpolate_four_pixels_head \src_fmt, \dst_fmt

+    bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt

 .endif

 .endm

 .macro bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt

-.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt

-    bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail

+.ifdef have_bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt

+    bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail

 .else

-    bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt

+    bilinear_interpolate_four_pixels_tail \src_fmt, \dst_fmt

 .endif

 .endm

 .macro bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt

-.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt

-    bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail_head

+.ifdef have_bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt

+    bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail_head

 .else

-    bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt

-    bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt

+    bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt

+    bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt

 .endif

 .endm

 .set BILINEAR_FLAG_UNROLL_4,          0

 .set BILINEAR_FLAG_UNROLL_8,          1

 .set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2

/*

@@ -3101,17 +3107,17 @@ generate_composite_function_nearest_scan

  *  prefetch_distance - prefetch in the source image by that many

  *                      pixels ahead

*/

 .macro generate_bilinear_scanline_func fname, src_fmt, dst_fmt, \

                                        src_bpp_shift, dst_bpp_shift, \

                                        prefetch_distance, flags

-pixman_asm_function fname

+pixman_asm_function \fname

     OUT       .req      r0

     TOP       .req      r1

     BOTTOM    .req      r2

     WT        .req      r3

     WB        .req      r4

     X         .req      r5

     UX        .req      r6

     WIDTH     .req      ip

@@ -3119,21 +3125,21 @@ pixman_asm_function fname

     TMP2      .req      r4

     PF_OFFS   .req      r7

     TMP3      .req      r8

     TMP4      .req      r9

     STRIDE    .req      r2

     mov       ip, sp

     push      {r4, r5, r6, r7, r8, r9}

-    mov       PF_OFFS, #prefetch_distance

+    mov       PF_OFFS, #\prefetch_distance

     ldmia     ip, {WB, X, UX, WIDTH}

     mul       PF_OFFS, PF_OFFS, UX

-.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0

+.if ((\flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0

     vpush     {d8-d15}

 .endif

     sub       STRIDE, BOTTOM, TOP

     .unreq    BOTTOM

     cmp       WIDTH, #0

     ble       3f

@@ -3146,83 +3152,83 @@ pixman_asm_function fname

     /* ensure good destination alignment  */

     cmp       WIDTH, #1

     blt       0f

     tst       OUT, #(1 << dst_bpp_shift)

     beq       0f

     vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)

     vadd.u16  q12, q12, q13

-    bilinear_interpolate_last_pixel src_fmt, dst_fmt

+    bilinear_interpolate_last_pixel \src_fmt, \dst_fmt

     sub       WIDTH, WIDTH, #1

0:

     vadd.u16  q13, q13, q13

     vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)

     vadd.u16  q12, q12, q13

     cmp       WIDTH, #2

     blt       0f

     tst       OUT, #(1 << (dst_bpp_shift + 1))

     beq       0f

-    bilinear_interpolate_two_pixels src_fmt, dst_fmt

+    bilinear_interpolate_two_pixels \src_fmt, \dst_fmt

     sub       WIDTH, WIDTH, #2

0:

-.if ((flags) & BILINEAR_FLAG_UNROLL_8) != 0

+.if ((\flags) & BILINEAR_FLAG_UNROLL_8) != 0

 /*********** 8 pixels per iteration *****************/

     cmp       WIDTH, #4

     blt       0f

     tst       OUT, #(1 << (dst_bpp_shift + 2))

     beq       0f

-    bilinear_interpolate_four_pixels src_fmt, dst_fmt

+    bilinear_interpolate_four_pixels \src_fmt, \dst_fmt

     sub       WIDTH, WIDTH, #4

0:

     subs      WIDTH, WIDTH, #8

     blt       1f

     mov       PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)

-    bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt

+    bilinear_interpolate_eight_pixels_head \src_fmt, \dst_fmt

     subs      WIDTH, WIDTH, #8

     blt       5f

0:

-    bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt

+    bilinear_interpolate_eight_pixels_tail_head \src_fmt, \dst_fmt

     subs      WIDTH, WIDTH, #8

     bge       0b

5:

-    bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt

+    bilinear_interpolate_eight_pixels_tail \src_fmt, \dst_fmt

1:

     tst       WIDTH, #4

     beq       2f

-    bilinear_interpolate_four_pixels src_fmt, dst_fmt

+    bilinear_interpolate_four_pixels \src_fmt, \dst_fmt

2:

 .else

 /*********** 4 pixels per iteration *****************/

     subs      WIDTH, WIDTH, #4

     blt       1f

     mov       PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)

-    bilinear_interpolate_four_pixels_head src_fmt, dst_fmt

+    bilinear_interpolate_four_pixels_head \src_fmt, \dst_fmt

     subs      WIDTH, WIDTH, #4

     blt       5f

0:

-    bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt

+    bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt

     subs      WIDTH, WIDTH, #4

     bge       0b

5:

-    bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt

+    bilinear_interpolate_four_pixels_tail \src_fmt, \dst_fmt

1:

 /****************************************************/

 .endif

     /* handle the remaining trailing pixels */

     tst       WIDTH, #2

     beq       2f

-    bilinear_interpolate_two_pixels src_fmt, dst_fmt

+    bilinear_interpolate_two_pixels \src_fmt, \dst_fmt

2:

     tst       WIDTH, #1

     beq       3f

-    bilinear_interpolate_last_pixel src_fmt, dst_fmt

+    bilinear_interpolate_last_pixel \src_fmt, \dst_fmt

3:

-.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0

+.if ((\flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0

     vpop      {d8-d15}

 .endif

     pop       {r4, r5, r6, r7, r8, r9}

     bx        lr

     .unreq    OUT

     .unreq    TOP

     .unreq    WT

@@ -3231,17 +3237,17 @@ 3:

     .unreq    UX

     .unreq    WIDTH

     .unreq    TMP1

     .unreq    TMP2

     .unreq    PF_OFFS

     .unreq    TMP3

     .unreq    TMP4

     .unreq    STRIDE

-.endfunc

+pixman_end_asm_function

 .endm

 /*****************************************************************************/

 .set have_bilinear_interpolate_four_pixels_8888_8888, 1

 .macro bilinear_interpolate_four_pixels_8888_8888_head

diff --git a/gfx/cairo/libpixman/src/pixman-arm-neon-asm.h b/gfx/cairo/libpixman/src/pixman-arm-neon-asm.h

--- a/gfx/cairo/libpixman/src/pixman-arm-neon-asm.h

+++ b/gfx/cairo/libpixman/src/pixman-arm-neon-asm.h

@@ -69,303 +69,303 @@

 .set PREFETCH_TYPE_ADVANCED,   2 /* Advanced fine-grained prefetch */

/*

  * Definitions of supplementary pixld/pixst macros (for partial load/store of

  * pixel data).

*/

 .macro pixldst1 op, elem_size, reg1, mem_operand, abits

-.if abits > 0

-    op&.&elem_size {d&reg1}, [&mem_operand&, :&abits&]!

+.if \abits > 0

+    \op\().\()\elem_size {d\()\reg1}, [\()\mem_operand\(), :\()\abits\()]!

 .else

-    op&.&elem_size {d&reg1}, [&mem_operand&]!

+    \op\().\()\elem_size {d\()\reg1}, [\()\mem_operand\()]!

 .endif

 .endm

 .macro pixldst2 op, elem_size, reg1, reg2, mem_operand, abits

-.if abits > 0

-    op&.&elem_size {d&reg1, d&reg2}, [&mem_operand&, :&abits&]!

+.if \abits > 0

+    \op\().\()\elem_size {d\()\reg1, d\()\reg2}, [\()\mem_operand\(), :\()\abits\()]!

 .else

-    op&.&elem_size {d&reg1, d&reg2}, [&mem_operand&]!

+    \op\().\()\elem_size {d\()\reg1, d\()\reg2}, [\()\mem_operand\()]!

 .endif

 .endm

 .macro pixldst4 op, elem_size, reg1, reg2, reg3, reg4, mem_operand, abits

-.if abits > 0

-    op&.&elem_size {d&reg1, d&reg2, d&reg3, d&reg4}, [&mem_operand&, :&abits&]!

+.if \abits > 0

+    \op\().\()\elem_size {d\()\reg1, d\()\reg2, d\()\reg3, d\()\reg4}, [\()\mem_operand\(), :\()\abits\()]!

 .else

-    op&.&elem_size {d&reg1, d&reg2, d&reg3, d&reg4}, [&mem_operand&]!

+    \op\().\()\elem_size {d\()\reg1, d\()\reg2, d\()\reg3, d\()\reg4}, [\()\mem_operand\()]!

 .endif

 .endm

 .macro pixldst0 op, elem_size, reg1, idx, mem_operand, abits

-    op&.&elem_size {d&reg1[idx]}, [&mem_operand&]!

+    \op\().\()\elem_size {d\()\reg1[\idx]}, [\()\mem_operand\()]!

 .endm

 .macro pixldst3 op, elem_size, reg1, reg2, reg3, mem_operand

-    op&.&elem_size {d&reg1, d&reg2, d&reg3}, [&mem_operand&]!

+    \op\().\()\elem_size {d\()\reg1, d\()\reg2, d\()\reg3}, [\()\mem_operand\()]!

 .endm

 .macro pixldst30 op, elem_size, reg1, reg2, reg3, idx, mem_operand

-    op&.&elem_size {d&reg1[idx], d&reg2[idx], d&reg3[idx]}, [&mem_operand&]!

+    \op\().\()\elem_size {d\()\reg1[\idx], d\()\reg2[\idx], d\()\reg3[\idx]}, [\()\mem_operand\()]!

 .endm

 .macro pixldst numbytes, op, elem_size, basereg, mem_operand, abits

-.if numbytes == 32

-    pixldst4 op, elem_size, %(basereg+4), %(basereg+5), \

-                              %(basereg+6), %(basereg+7), mem_operand, abits

-.elseif numbytes == 16

-    pixldst2 op, elem_size, %(basereg+2), %(basereg+3), mem_operand, abits

-.elseif numbytes == 8

-    pixldst1 op, elem_size, %(basereg+1), mem_operand, abits

-.elseif numbytes == 4

-    .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 32)

-        pixldst0 op, 32, %(basereg+0), 1, mem_operand, abits

-    .elseif elem_size == 16

-        pixldst0 op, 16, %(basereg+0), 2, mem_operand, abits

-        pixldst0 op, 16, %(basereg+0), 3, mem_operand, abits

+.if \numbytes == 32

+    pixldst4 \op, \elem_size, %(\basereg+4), %(\basereg+5), \

+                              %(\basereg+6), %(\basereg+7), \mem_operand, \abits

+.elseif \numbytes == 16

+    pixldst2 \op, \elem_size, %(\basereg+2), %(\basereg+3), \mem_operand, \abits

+.elseif \numbytes == 8

+    pixldst1 \op, \elem_size, %(\basereg+1), \mem_operand, \abits

+.elseif \numbytes == 4

+    .if !RESPECT_STRICT_ALIGNMENT || (\elem_size == 32)

+        pixldst0 \op, 32, %(\basereg+0), 1, \mem_operand, \abits

+    .elseif \elem_size == 16

+        pixldst0 \op, 16, %(\basereg+0), 2, \mem_operand, \abits

+        pixldst0 \op, 16, %(\basereg+0), 3, \mem_operand, \abits

     .else

-        pixldst0 op, 8, %(basereg+0), 4, mem_operand, abits

-        pixldst0 op, 8, %(basereg+0), 5, mem_operand, abits

-        pixldst0 op, 8, %(basereg+0), 6, mem_operand, abits

-        pixldst0 op, 8, %(basereg+0), 7, mem_operand, abits

+        pixldst0 \op, 8, %(\basereg+0), 4, \mem_operand, \abits

+        pixldst0 \op, 8, %(\basereg+0), 5, \mem_operand, \abits

+        pixldst0 \op, 8, %(\basereg+0), 6, \mem_operand, \abits

+        pixldst0 \op, 8, %(\basereg+0), 7, \mem_operand, \abits

     .endif

-.elseif numbytes == 2

-    .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 16)

-        pixldst0 op, 16, %(basereg+0), 1, mem_operand, abits

+.elseif \numbytes == 2

+    .if !RESPECT_STRICT_ALIGNMENT || (\elem_size == 16)

+        pixldst0 \op, 16, %(\basereg+0), 1, \mem_operand, \abits

     .else

-        pixldst0 op, 8, %(basereg+0), 2, mem_operand, abits

-        pixldst0 op, 8, %(basereg+0), 3, mem_operand, abits

+        pixldst0 \op, 8, %(\basereg+0), 2, \mem_operand, \abits

+        pixldst0 \op, 8, %(\basereg+0), 3, \mem_operand, \abits

     .endif

-.elseif numbytes == 1

-    pixldst0 op, 8, %(basereg+0), 1, mem_operand, abits

+.elseif \numbytes == 1

+    pixldst0 \op, 8, %(\basereg+0), 1, \mem_operand, \abits

 .else

-    .error "unsupported size: numbytes"

+    .error "unsupported size: \numbytes"

 .endif

 .endm

 .macro pixld numpix, bpp, basereg, mem_operand, abits=0

-.if bpp > 0

-.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)

-    pixldst4 vld4, 8, %(basereg+4), %(basereg+5), \

-                      %(basereg+6), %(basereg+7), mem_operand, abits

-.elseif (bpp == 24) && (numpix == 8)

-    pixldst3 vld3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand

-.elseif (bpp == 24) && (numpix == 4)

-    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand

-    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand

-    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand

-    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand

-.elseif (bpp == 24) && (numpix == 2)

-    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand

-    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand

-.elseif (bpp == 24) && (numpix == 1)

-    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand

+.if \bpp > 0

+.if (\bpp == 32) && (\numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)

+    pixldst4 vld4, 8, %(\basereg+4), %(\basereg+5), \

+                      %(\basereg+6), %(\basereg+7), \mem_operand, \abits

+.elseif (\bpp == 24) && (\numpix == 8)

+    pixldst3 vld3, 8, %(\basereg+3), %(\basereg+4), %(\basereg+5), \mem_operand

+.elseif (\bpp == 24) && (\numpix == 4)

+    pixldst30 vld3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 4, \mem_operand

+    pixldst30 vld3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 5, \mem_operand

+    pixldst30 vld3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 6, \mem_operand

+    pixldst30 vld3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 7, \mem_operand

+.elseif (\bpp == 24) && (\numpix == 2)

+    pixldst30 vld3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 2, \mem_operand

+    pixldst30 vld3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 3, \mem_operand

+.elseif (\bpp == 24) && (\numpix == 1)

+    pixldst30 vld3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 1, \mem_operand

 .else

-    pixldst %(numpix * bpp / 8), vld1, %(bpp), basereg, mem_operand, abits

+    pixldst %(\numpix * \bpp / 8), vld1, %(\bpp), \basereg, \mem_operand, \abits

 .endif

 .endif

 .endm

 .macro pixst numpix, bpp, basereg, mem_operand, abits=0

-.if bpp > 0

-.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)

-    pixldst4 vst4, 8, %(basereg+4), %(basereg+5), \

-                      %(basereg+6), %(basereg+7), mem_operand, abits

-.elseif (bpp == 24) && (numpix == 8)

-    pixldst3 vst3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand

-.elseif (bpp == 24) && (numpix == 4)

-    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand

-    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand

-    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand

-    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand

-.elseif (bpp == 24) && (numpix == 2)

-    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand

-    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand

-.elseif (bpp == 24) && (numpix == 1)

-    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand

+.if \bpp > 0

+.if (\bpp == 32) && (\numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)

+    pixldst4 vst4, 8, %(\basereg+4), %(\basereg+5), \

+                      %(\basereg+6), %(\basereg+7), \mem_operand, \abits

+.elseif (\bpp == 24) && (\numpix == 8)

+    pixldst3 vst3, 8, %(\basereg+3), %(\basereg+4), %(\basereg+5), \mem_operand

+.elseif (\bpp == 24) && (\numpix == 4)

+    pixldst30 vst3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 4, \mem_operand

+    pixldst30 vst3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 5, \mem_operand

+    pixldst30 vst3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 6, \mem_operand

+    pixldst30 vst3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 7, \mem_operand

+.elseif (\bpp == 24) && (\numpix == 2)

+    pixldst30 vst3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 2, \mem_operand

+    pixldst30 vst3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 3, \mem_operand

+.elseif (\bpp == 24) && (\numpix == 1)

+    pixldst30 vst3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 1, \mem_operand

 .else

-    pixldst %(numpix * bpp / 8), vst1, %(bpp), basereg, mem_operand, abits

+    pixldst %(\numpix * \bpp / 8), vst1, %(\bpp), \basereg, \mem_operand, \abits

 .endif

 .endif

 .endm

 .macro pixld_a numpix, bpp, basereg, mem_operand

-.if (bpp * numpix) <= 128

-    pixld numpix, bpp, basereg, mem_operand, %(bpp * numpix)

+.if (\bpp * \numpix) <= 128

+    pixld \numpix, \bpp, \basereg, \mem_operand, %(\bpp * \numpix)

 .else

-    pixld numpix, bpp, basereg, mem_operand, 128

+    pixld \numpix, \bpp, \basereg, \mem_operand, 128

 .endif

 .endm

 .macro pixst_a numpix, bpp, basereg, mem_operand

-.if (bpp * numpix) <= 128

-    pixst numpix, bpp, basereg, mem_operand, %(bpp * numpix)

+.if (\bpp * \numpix) <= 128

+    pixst \numpix, \bpp, \basereg, \mem_operand, %(\bpp * \numpix)

 .else

-    pixst numpix, bpp, basereg, mem_operand, 128

+    pixst \numpix, \bpp, \basereg, \mem_operand, 128

 .endif

 .endm

/*

  * Pixel fetcher for nearest scaling (needs TMP1, TMP2, VX, UNIT_X register

  * aliases to be defined)

*/

 .macro pixld1_s elem_size, reg1, mem_operand

-.if elem_size == 16

+.if \elem_size == 16

     mov     TMP1, VX, asr #16

     adds    VX, VX, UNIT_X

 5:  subpls  VX, VX, SRC_WIDTH_FIXED

     bpl     5b

-    add     TMP1, mem_operand, TMP1, asl #1

+    add     TMP1, \mem_operand, TMP1, asl #1

     mov     TMP2, VX, asr #16

     adds    VX, VX, UNIT_X

 5:  subpls  VX, VX, SRC_WIDTH_FIXED

     bpl     5b

-    add     TMP2, mem_operand, TMP2, asl #1

-    vld1.16 {d&reg1&[0]}, [TMP1, :16]

+    add     TMP2, \mem_operand, TMP2, asl #1

+    vld1.16 {d\()\reg1\()[0]}, [TMP1, :16]

     mov     TMP1, VX, asr #16

     adds    VX, VX, UNIT_X

 5:  subpls  VX, VX, SRC_WIDTH_FIXED

     bpl     5b

-    add     TMP1, mem_operand, TMP1, asl #1

-    vld1.16 {d&reg1&[1]}, [TMP2, :16]

+    add     TMP1, \mem_operand, TMP1, asl #1

+    vld1.16 {d\()\reg1\()[1]}, [TMP2, :16]

     mov     TMP2, VX, asr #16

     adds    VX, VX, UNIT_X

 5:  subpls  VX, VX, SRC_WIDTH_FIXED

     bpl     5b

-    add     TMP2, mem_operand, TMP2, asl #1

-    vld1.16 {d&reg1&[2]}, [TMP1, :16]

-    vld1.16 {d&reg1&[3]}, [TMP2, :16]

-.elseif elem_size == 32

+    add     TMP2, \mem_operand, TMP2, asl #1

+    vld1.16 {d\()\reg1\()[2]}, [TMP1, :16]

+    vld1.16 {d\()\reg1\()[3]}, [TMP2, :16]

+.elseif \elem_size == 32

     mov     TMP1, VX, asr #16

     adds    VX, VX, UNIT_X

 5:  subpls  VX, VX, SRC_WIDTH_FIXED

     bpl     5b

-    add     TMP1, mem_operand, TMP1, asl #2

+    add     TMP1, \mem_operand, TMP1, asl #2

     mov     TMP2, VX, asr #16

     adds    VX, VX, UNIT_X

 5:  subpls  VX, VX, SRC_WIDTH_FIXED

     bpl     5b

-    add     TMP2, mem_operand, TMP2, asl #2

-    vld1.32 {d&reg1&[0]}, [TMP1, :32]

-    vld1.32 {d&reg1&[1]}, [TMP2, :32]

+    add     TMP2, \mem_operand, TMP2, asl #2

+    vld1.32 {d\()\reg1\()[0]}, [TMP1, :32]

+    vld1.32 {d\()\reg1\()[1]}, [TMP2, :32]

 .else

     .error "unsupported"

 .endif

 .endm

 .macro pixld2_s elem_size, reg1, reg2, mem_operand

 .if 0 /* elem_size == 32 */

     mov     TMP1, VX, asr #16

     add     VX, VX, UNIT_X, asl #1

-    add     TMP1, mem_operand, TMP1, asl #2

+    add     TMP1, \mem_operand, TMP1, asl #2

     mov     TMP2, VX, asr #16

     sub     VX, VX, UNIT_X

-    add     TMP2, mem_operand, TMP2, asl #2

-    vld1.32 {d&reg1&[0]}, [TMP1, :32]

+    add     TMP2, \mem_operand, TMP2, asl #2

+    vld1.32 {d\()\reg1\()[0]}, [TMP1, :32]

     mov     TMP1, VX, asr #16

     add     VX, VX, UNIT_X, asl #1

-    add     TMP1, mem_operand, TMP1, asl #2

-    vld1.32 {d&reg2&[0]}, [TMP2, :32]

+    add     TMP1, \mem_operand, TMP1, asl #2

+    vld1.32 {d\()\reg2\()[0]}, [TMP2, :32]

     mov     TMP2, VX, asr #16

     add     VX, VX, UNIT_X

-    add     TMP2, mem_operand, TMP2, asl #2

-    vld1.32 {d&reg1&[1]}, [TMP1, :32]

-    vld1.32 {d&reg2&[1]}, [TMP2, :32]

+    add     TMP2, \mem_operand, TMP2, asl #2

+    vld1.32 {d\()\reg1\()[1]}, [TMP1, :32]

+    vld1.32 {d\()\reg2\()[1]}, [TMP2, :32]

 .else

-    pixld1_s elem_size, reg1, mem_operand

-    pixld1_s elem_size, reg2, mem_operand

+    pixld1_s \elem_size, \reg1, \mem_operand

+    pixld1_s \elem_size, \reg2, \mem_operand

 .endif

 .endm

 .macro pixld0_s elem_size, reg1, idx, mem_operand

-.if elem_size == 16

+.if \elem_size == 16

     mov     TMP1, VX, asr #16

     adds    VX, VX, UNIT_X

 5:  subpls  VX, VX, SRC_WIDTH_FIXED

     bpl     5b

-    add     TMP1, mem_operand, TMP1, asl #1

-    vld1.16 {d&reg1&[idx]}, [TMP1, :16]

-.elseif elem_size == 32

+    add     TMP1, \mem_operand, TMP1, asl #1

+    vld1.16 {d\()\reg1\()[\idx]}, [TMP1, :16]

+.elseif \elem_size == 32

     mov     TMP1, VX, asr #16

     adds    VX, VX, UNIT_X

 5:  subpls  VX, VX, SRC_WIDTH_FIXED

     bpl     5b

-    add     TMP1, mem_operand, TMP1, asl #2

-    vld1.32 {d&reg1&[idx]}, [TMP1, :32]

+    add     TMP1, \mem_operand, TMP1, asl #2

+    vld1.32 {d\()\reg1\()[\idx]}, [TMP1, :32]

 .endif

 .endm

 .macro pixld_s_internal numbytes, elem_size, basereg, mem_operand

-.if numbytes == 32

-    pixld2_s elem_size, %(basereg+4), %(basereg+5), mem_operand

-    pixld2_s elem_size, %(basereg+6), %(basereg+7), mem_operand

-    pixdeinterleave elem_size, %(basereg+4)

-.elseif numbytes == 16

-    pixld2_s elem_size, %(basereg+2), %(basereg+3), mem_operand

-.elseif numbytes == 8

-    pixld1_s elem_size, %(basereg+1), mem_operand

-.elseif numbytes == 4

-    .if elem_size == 32

-        pixld0_s elem_size, %(basereg+0), 1, mem_operand

-    .elseif elem_size == 16

-        pixld0_s elem_size, %(basereg+0), 2, mem_operand

-        pixld0_s elem_size, %(basereg+0), 3, mem_operand

+.if \numbytes == 32

+    pixld2_s \elem_size, %(\basereg+4), %(\basereg+5), \mem_operand

+    pixld2_s \elem_size, %(\basereg+6), %(\basereg+7), \mem_operand

+    pixdeinterleave \elem_size, %(\basereg+4)

+.elseif \numbytes == 16

+    pixld2_s \elem_size, %(\basereg+2), %(\basereg+3), \mem_operand

+.elseif \numbytes == 8

+    pixld1_s \elem_size, %(\basereg+1), \mem_operand

+.elseif \numbytes == 4

+    .if \elem_size == 32

+        pixld0_s \elem_size, %(\basereg+0), 1, \mem_operand

+    .elseif \elem_size == 16

+        pixld0_s \elem_size, %(\basereg+0), 2, \mem_operand

+        pixld0_s \elem_size, %(\basereg+0), 3, \mem_operand

     .else

-        pixld0_s elem_size, %(basereg+0), 4, mem_operand

-        pixld0_s elem_size, %(basereg+0), 5, mem_operand

-        pixld0_s elem_size, %(basereg+0), 6, mem_operand

-        pixld0_s elem_size, %(basereg+0), 7, mem_operand

+        pixld0_s \elem_size, %(\basereg+0), 4, \mem_operand

+        pixld0_s \elem_size, %(\basereg+0), 5, \mem_operand

+        pixld0_s \elem_size, %(\basereg+0), 6, \mem_operand

+        pixld0_s \elem_size, %(\basereg+0), 7, \mem_operand

     .endif

-.elseif numbytes == 2

-    .if elem_size == 16

-        pixld0_s elem_size, %(basereg+0), 1, mem_operand

+.elseif \numbytes == 2

+    .if \elem_size == 16

+        pixld0_s \elem_size, %(\basereg+0), 1, \mem_operand

     .else

-        pixld0_s elem_size, %(basereg+0), 2, mem_operand

-        pixld0_s elem_size, %(basereg+0), 3, mem_operand

+        pixld0_s \elem_size, %(\basereg+0), 2, \mem_operand

+        pixld0_s \elem_size, %(\basereg+0), 3, \mem_operand

     .endif

-.elseif numbytes == 1

-    pixld0_s elem_size, %(basereg+0), 1, mem_operand

+.elseif \numbytes == 1

+    pixld0_s \elem_size, %(\basereg+0), 1, \mem_operand

 .else

-    .error "unsupported size: numbytes"

+    .error "unsupported size: \numbytes"

 .endif

 .endm

 .macro pixld_s numpix, bpp, basereg, mem_operand

-.if bpp > 0

-    pixld_s_internal %(numpix * bpp / 8), %(bpp), basereg, mem_operand

+.if \bpp > 0

+    pixld_s_internal %(\numpix * \bpp / 8), %(\bpp), \basereg, \mem_operand

 .endif

 .endm

 .macro vuzp8 reg1, reg2

-    vuzp.8 d&reg1, d&reg2

+    vuzp.8 d\()\reg1, d\()\reg2

 .endm

 .macro vzip8 reg1, reg2

-    vzip.8 d&reg1, d&reg2

+    vzip.8 d\()\reg1, d\()\reg2

 .endm

 /* deinterleave B, G, R, A channels for eight 32bpp pixels in 4 registers */

 .macro pixdeinterleave bpp, basereg

-.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)

-    vuzp8 %(basereg+0), %(basereg+1)

-    vuzp8 %(basereg+2), %(basereg+3)

-    vuzp8 %(basereg+1), %(basereg+3)

-    vuzp8 %(basereg+0), %(basereg+2)

+.if (\bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)

+    vuzp8 %(\basereg+0), %(\basereg+1)

+    vuzp8 %(\basereg+2), %(\basereg+3)

+    vuzp8 %(\basereg+1), %(\basereg+3)

+    vuzp8 %(\basereg+0), %(\basereg+2)

 .endif

 .endm

 /* interleave B, G, R, A channels for eight 32bpp pixels in 4 registers */

 .macro pixinterleave bpp, basereg

-.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)

-    vzip8 %(basereg+0), %(basereg+2)

-    vzip8 %(basereg+1), %(basereg+3)

-    vzip8 %(basereg+2), %(basereg+3)

-    vzip8 %(basereg+0), %(basereg+1)

+.if (\bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)

+    vzip8 %(\basereg+0), %(\basereg+2)

+    vzip8 %(\basereg+1), %(\basereg+3)

+    vzip8 %(\basereg+2), %(\basereg+3)

+    vzip8 %(\basereg+0), %(\basereg+1)

 .endif

 .endm

/*

  * This is a macro for implementing cache preload. The main idea is that

  * cache preload logic is mostly independent from the rest of pixels

  * processing code. It starts at the top left pixel and moves forward

  * across pixels and can jump across scanlines. Prefetch distance is

@@ -389,51 +389,51 @@ 5:  subpls  VX, VX, SRC_WIDTH_FIXED

  * for almost zero cost!

  * (*) The overhead of the prefetcher is visible when running some trivial

  * pixels processing like simple copy. Anyway, having prefetch is a must

  * when working with the graphics data.

*/

 .macro PF a, x:vararg

 .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_ADVANCED)

-    a x

+    \a \x

 .endif

 .endm

 .macro cache_preload std_increment, boost_increment

 .if (src_bpp_shift >= 0) || (dst_r_bpp != 0) || (mask_bpp_shift >= 0)

 .if regs_shortage

-    PF ldr ORIG_W, [sp] /* If we are short on regs, ORIG_W is kept on stack */

+    PF ldr, ORIG_W, [sp] /* If we are short on regs, ORIG_W is kept on stack */

 .endif

-.if std_increment != 0

-    PF add PF_X, PF_X, #std_increment

+.if \std_increment != 0

+    PF add, PF_X, PF_X, #\std_increment

 .endif

-    PF tst PF_CTL, #0xF

-    PF addne PF_X, PF_X, #boost_increment

-    PF subne PF_CTL, PF_CTL, #1

-    PF cmp PF_X, ORIG_W

+    PF tst, PF_CTL, #0xF

+    PF addne, PF_X, PF_X, #\boost_increment

+    PF subne, PF_CTL, PF_CTL, #1

+    PF cmp, PF_X, ORIG_W

 .if src_bpp_shift >= 0

     PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]

 .endif

 .if dst_r_bpp != 0

     PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]

 .endif

 .if mask_bpp_shift >= 0

     PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]

 .endif

-    PF subge PF_X, PF_X, ORIG_W

-    PF subges PF_CTL, PF_CTL, #0x10

+    PF subge, PF_X, PF_X, ORIG_W

+    PF subges, PF_CTL, PF_CTL, #0x10

 .if src_bpp_shift >= 0

-    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!

+    PF ldrgeb, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!

 .endif

 .if dst_r_bpp != 0

-    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!

+    PF ldrgeb, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!

 .endif

 .if mask_bpp_shift >= 0

-    PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!

+    PF ldrgeb, DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!

 .endif

 .endif

 .endm

 .macro cache_preload_simple

 .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_SIMPLE)

 .if src_bpp > 0

     pld [SRC, #(PREFETCH_DISTANCE_SIMPLE * src_bpp / 8)]

@@ -460,51 +460,53 @@ 5:  subpls  VX, VX, SRC_WIDTH_FIXED

 .macro ensure_destination_ptr_alignment process_pixblock_head, \

                                         process_pixblock_tail, \

                                         process_pixblock_tail_head

 .if dst_w_bpp != 24

     tst         DST_R, #0xF

     beq         2f

 .irp lowbit, 1, 2, 4, 8, 16

+#ifndef __clang__

 local skip1

-.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp))

-.if lowbit < 16 /* we don't need more than 16-byte alignment */

-    tst         DST_R, #lowbit

+#endif

+.if (dst_w_bpp <= (\lowbit * 8)) && ((\lowbit * 8) < (pixblock_size * dst_w_bpp))

+.if \lowbit < 16 /* we don't need more than 16-byte alignment */

+    tst         DST_R, #\lowbit

     beq         1f

 .endif

-    pixld_src   (lowbit * 8 / dst_w_bpp), src_bpp, src_basereg, SRC

-    pixld       (lowbit * 8 / dst_w_bpp), mask_bpp, mask_basereg, MASK

+    pixld_src   (\lowbit * 8 / dst_w_bpp), src_bpp, src_basereg, SRC

+    pixld       (\lowbit * 8 / dst_w_bpp), mask_bpp, mask_basereg, MASK

 .if dst_r_bpp > 0

-    pixld_a     (lowbit * 8 / dst_r_bpp), dst_r_bpp, dst_r_basereg, DST_R

+    pixld_a     (\lowbit * 8 / dst_r_bpp), dst_r_bpp, dst_r_basereg, DST_R

 .else

-    add         DST_R, DST_R, #lowbit

+    add         DST_R, DST_R, #\lowbit

 .endif

-    PF add      PF_X, PF_X, #(lowbit * 8 / dst_w_bpp)

-    sub         W, W, #(lowbit * 8 / dst_w_bpp)

+    PF add,     PF_X, PF_X, #(\lowbit * 8 / dst_w_bpp)

+    sub         W, W, #(\lowbit * 8 / dst_w_bpp)

1:

 .endif

 .endr

     pixdeinterleave src_bpp, src_basereg

     pixdeinterleave mask_bpp, mask_basereg

     pixdeinterleave dst_r_bpp, dst_r_basereg

-    process_pixblock_head

+    \process_pixblock_head

     cache_preload 0, pixblock_size

     cache_preload_simple

-    process_pixblock_tail

+    \process_pixblock_tail

     pixinterleave dst_w_bpp, dst_w_basereg

 .irp lowbit, 1, 2, 4, 8, 16

-.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp))

-.if lowbit < 16 /* we don't need more than 16-byte alignment */

-    tst         DST_W, #lowbit

+.if (dst_w_bpp <= (\lowbit * 8)) && ((\lowbit * 8) < (pixblock_size * dst_w_bpp))

+.if \lowbit < 16 /* we don't need more than 16-byte alignment */

+    tst         DST_W, #\lowbit

     beq         1f

 .endif

-    pixst_a     (lowbit * 8 / dst_w_bpp), dst_w_bpp, dst_w_basereg, DST_W

+    pixst_a     (\lowbit * 8 / dst_w_bpp), dst_w_bpp, dst_w_basereg, DST_W

1:

 .endif

 .endr

 .endif

2:

 .endm

/*

@@ -525,51 +527,51 @@ 2:

 .macro process_trailing_pixels cache_preload_flag, \

                                dst_aligned_flag, \

                                process_pixblock_head, \

                                process_pixblock_tail, \

                                process_pixblock_tail_head

     tst         W, #(pixblock_size - 1)

     beq         2f

 .irp chunk_size, 16, 8, 4, 2, 1

-.if pixblock_size > chunk_size

-    tst         W, #chunk_size

+.if pixblock_size > \chunk_size

+    tst         W, #\chunk_size

     beq         1f

-    pixld_src   chunk_size, src_bpp, src_basereg, SRC

-    pixld       chunk_size, mask_bpp, mask_basereg, MASK

-.if dst_aligned_flag != 0

-    pixld_a     chunk_size, dst_r_bpp, dst_r_basereg, DST_R

+    pixld_src   \chunk_size, src_bpp, src_basereg, SRC

+    pixld       \chunk_size, mask_bpp, mask_basereg, MASK

+.if \dst_aligned_flag != 0

+    pixld_a     \chunk_size, dst_r_bpp, dst_r_basereg, DST_R

 .else

-    pixld       chunk_size, dst_r_bpp, dst_r_basereg, DST_R

+    pixld       \chunk_size, dst_r_bpp, dst_r_basereg, DST_R

 .endif

-.if cache_preload_flag != 0

-    PF add      PF_X, PF_X, #chunk_size

+.if \cache_preload_flag != 0

+    PF add,     PF_X, PF_X, #\chunk_size

 .endif

1:

 .endif

 .endr

     pixdeinterleave src_bpp, src_basereg

     pixdeinterleave mask_bpp, mask_basereg

     pixdeinterleave dst_r_bpp, dst_r_basereg

-    process_pixblock_head

-.if cache_preload_flag != 0

+    \process_pixblock_head

+.if \cache_preload_flag != 0

     cache_preload 0, pixblock_size

     cache_preload_simple

 .endif

-    process_pixblock_tail

+    \process_pixblock_tail

     pixinterleave dst_w_bpp, dst_w_basereg

 .irp chunk_size, 16, 8, 4, 2, 1

-.if pixblock_size > chunk_size

-    tst         W, #chunk_size

+.if pixblock_size > \chunk_size

+    tst         W, #\chunk_size

     beq         1f

-.if dst_aligned_flag != 0

-    pixst_a     chunk_size, dst_w_bpp, dst_w_basereg, DST_W

+.if \dst_aligned_flag != 0

+    pixst_a     \chunk_size, dst_w_bpp, dst_w_basereg, DST_W

 .else

-    pixst       chunk_size, dst_w_bpp, dst_w_basereg, DST_W

+    pixst       \chunk_size, dst_w_bpp, dst_w_basereg, DST_W

 .endif

1:

 .endif

 .endr

2:

 .endm

/*

@@ -599,17 +601,17 @@ 2:

 .if (mask_bpp != 24) && (mask_bpp != 0)

     sub         MASK, MASK, W, lsl #mask_bpp_shift

 .endif

     subs        H, H, #1

     mov         DST_R, DST_W

 .if regs_shortage

     str         H, [sp, #4] /* save updated height to stack */

 .endif

-    bge         start_of_loop_label

+    bge         \start_of_loop_label

 .endm

/*

  * Registers are allocated in the following way by default:

  * d0, d1, d2, d3     - reserved for loading source pixel data

  * d4, d5, d6, d7     - reserved for loading destination pixel data

  * d24, d25, d26, d27 - reserved for loading mask pixel data

  * d28, d29, d30, d31 - final destination pixel data for writeback to memory

@@ -626,48 +628,48 @@ 2:

                                    process_pixblock_head, \

                                    process_pixblock_tail, \

                                    process_pixblock_tail_head, \

                                    dst_w_basereg_ = 28, \

                                    dst_r_basereg_ = 4, \

                                    src_basereg_   = 0, \

                                    mask_basereg_  = 24

-    pixman_asm_function fname

+    pixman_asm_function \fname

     push        {r4-r12, lr}        /* save all registers */

/*

  * Select prefetch type for this function. If prefetch distance is

  * set to 0 or one of the color formats is 24bpp, SIMPLE prefetch

  * has to be used instead of ADVANCED.

*/

     .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_DEFAULT

-.if prefetch_distance == 0

+.if \prefetch_distance == 0

     .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE

 .elseif (PREFETCH_TYPE_CURRENT > PREFETCH_TYPE_SIMPLE) && \

-        ((src_bpp_ == 24) || (mask_bpp_ == 24) || (dst_w_bpp_ == 24))

+        ((\src_bpp_ == 24) || (\mask_bpp_ == 24) || (\dst_w_bpp_ == 24))

     .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_SIMPLE

 .endif

/*

  * Make some macro arguments globally visible and accessible

  * from other macros

*/

-    .set src_bpp, src_bpp_

-    .set mask_bpp, mask_bpp_

-    .set dst_w_bpp, dst_w_bpp_

-    .set pixblock_size, pixblock_size_

-    .set dst_w_basereg, dst_w_basereg_

-    .set dst_r_basereg, dst_r_basereg_

-    .set src_basereg, src_basereg_

-    .set mask_basereg, mask_basereg_

+    .set src_bpp, \src_bpp_

+    .set mask_bpp, \mask_bpp_

+    .set dst_w_bpp, \dst_w_bpp_

+    .set pixblock_size, \pixblock_size_

+    .set dst_w_basereg, \dst_w_basereg_

+    .set dst_r_basereg, \dst_r_basereg_

+    .set src_basereg, \src_basereg_

+    .set mask_basereg, \mask_basereg_

     .macro pixld_src x:vararg

-        pixld x

+        pixld \x

     .endm

     .macro fetch_src_pixblock

         pixld_src   pixblock_size, src_bpp, \

                     (src_basereg - pixblock_size * src_bpp / 64), SRC

     .endm

/*

  * Assign symbolic names to registers

*/

@@ -750,38 +752,38 @@ 2:

 .elseif dst_w_bpp == 16

     .set dst_bpp_shift, 1

 .elseif dst_w_bpp == 8

     .set dst_bpp_shift, 0

 .else

     .error "requested dst bpp (dst_w_bpp) is not supported"

 .endif

-.if (((flags) & FLAG_DST_READWRITE) != 0)

+.if (((\flags) & FLAG_DST_READWRITE) != 0)

     .set dst_r_bpp, dst_w_bpp

 .else

     .set dst_r_bpp, 0

 .endif

-.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0)

+.if (((\flags) & FLAG_DEINTERLEAVE_32BPP) != 0)

     .set DEINTERLEAVE_32BPP_ENABLED, 1

 .else

     .set DEINTERLEAVE_32BPP_ENABLED, 0

 .endif

-.if prefetch_distance < 0 || prefetch_distance > 15

-    .error "invalid prefetch distance (prefetch_distance)"

+.if \prefetch_distance < 0 || \prefetch_distance > 15

+    .error "invalid prefetch distance (\prefetch_distance)"

 .endif

 .if src_bpp > 0

     ldr         SRC, [sp, #40]

 .endif

 .if mask_bpp > 0

     ldr         MASK, [sp, #48]

 .endif

-    PF mov      PF_X, #0

+    PF mov,     PF_X, #0

 .if src_bpp > 0

     ldr         SRC_STRIDE, [sp, #44]

 .endif

 .if mask_bpp > 0

     ldr         MASK_STRIDE, [sp, #52]

 .endif

     mov         DST_R, DST_W

@@ -796,24 +798,24 @@ 2:

 .if dst_w_bpp == 24

     sub         DST_STRIDE, DST_STRIDE, W

     sub         DST_STRIDE, DST_STRIDE, W, lsl #1

 .endif

/*

  * Setup advanced prefetcher initial state

*/

-    PF mov      PF_SRC, SRC

-    PF mov      PF_DST, DST_R

-    PF mov      PF_MASK, MASK

+    PF mov,     PF_SRC, SRC

+    PF mov,     PF_DST, DST_R

+    PF mov,     PF_MASK, MASK

     /* PF_CTL = prefetch_distance | ((h - 1) << 4) */

-    PF mov      PF_CTL, H, lsl #4

-    PF add      PF_CTL, #(prefetch_distance - 0x10)

+    PF mov,     PF_CTL, H, lsl #4

+    PF add,     PF_CTL, #(\prefetch_distance - 0x10)

-    init

+    \init

 .if regs_shortage

     push        {r0, r1}

 .endif

     subs        H, H, #1

 .if regs_shortage

     str         H, [sp, #4] /* save updated height to stack */

 .else

     mov         ORIG_W, W

@@ -821,84 +823,84 @@ 2:

     blt         9f

     cmp         W, #(pixblock_size * 2)

     blt         8f

/*

  * This is the start of the pipelined loop, which if optimized for

  * long scanlines

*/

0:

-    ensure_destination_ptr_alignment process_pixblock_head, \

-                                     process_pixblock_tail, \

-                                     process_pixblock_tail_head

+    ensure_destination_ptr_alignment \process_pixblock_head, \

+                                     \process_pixblock_tail, \

+                                     \process_pixblock_tail_head

     /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */

     pixld_a     pixblock_size, dst_r_bpp, \

                 (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R

     fetch_src_pixblock

     pixld       pixblock_size, mask_bpp, \

                 (mask_basereg - pixblock_size * mask_bpp / 64), MASK

-    PF add      PF_X, PF_X, #pixblock_size

-    process_pixblock_head

+    PF add,     PF_X, PF_X, #pixblock_size

+    \process_pixblock_head

     cache_preload 0, pixblock_size

     cache_preload_simple

     subs        W, W, #(pixblock_size * 2)

     blt         2f

1:

-    process_pixblock_tail_head

+    \process_pixblock_tail_head

     cache_preload_simple

     subs        W, W, #pixblock_size

     bge         1b

2:

-    process_pixblock_tail

+    \process_pixblock_tail

     pixst_a     pixblock_size, dst_w_bpp, \

                 (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W

     /* Process the remaining trailing pixels in the scanline */

     process_trailing_pixels 1, 1, \

-                            process_pixblock_head, \

-                            process_pixblock_tail, \

-                            process_pixblock_tail_head

+                            \process_pixblock_head, \

+                            \process_pixblock_tail, \

+                            \process_pixblock_tail_head

     advance_to_next_scanline 0b

 .if regs_shortage

     pop         {r0, r1}

 .endif

-    cleanup

+    \cleanup

     pop         {r4-r12, pc}  /* exit */

/*

  * This is the start of the loop, designed to process images with small width

  * (less than pixblock_size * 2 pixels). In this case neither pipelining

  * nor prefetch are used.

*/

8:

     /* Process exactly pixblock_size pixels if needed */

     tst         W, #pixblock_size

     beq         1f

     pixld       pixblock_size, dst_r_bpp, \

                 (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R

     fetch_src_pixblock

     pixld       pixblock_size, mask_bpp, \

                 (mask_basereg - pixblock_size * mask_bpp / 64), MASK

-    process_pixblock_head

-    process_pixblock_tail

+    \process_pixblock_head

+    \process_pixblock_tail

     pixst       pixblock_size, dst_w_bpp, \

                 (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W

1:

     /* Process the remaining trailing pixels in the scanline */

     process_trailing_pixels 0, 0, \

-                            process_pixblock_head, \

-                            process_pixblock_tail, \

-                            process_pixblock_tail_head

+                            \process_pixblock_head, \

+                            \process_pixblock_tail, \

+                            \process_pixblock_tail_head

     advance_to_next_scanline 8b

9:

 .if regs_shortage

     pop         {r0, r1}

 .endif

-    cleanup

+    \cleanup

     pop         {r4-r12, pc}  /* exit */

     .purgem     fetch_src_pixblock

     .purgem     pixld_src

     .unreq      SRC

     .unreq      MASK

     .unreq      DST_R

@@ -910,17 +912,17 @@ 9:

     .unreq      DST_STRIDE

     .unreq      MASK_STRIDE

     .unreq      PF_CTL

     .unreq      PF_X

     .unreq      PF_SRC

     .unreq      PF_DST

     .unreq      PF_MASK

     .unreq      DUMMY

-    .endfunc

+    pixman_end_asm_function

 .endm

/*

  * A simplified variant of function generation template for a single

  * scanline processing (for implementing pixman combine functions)

*/

 .macro generate_composite_function_scanline        use_nearest_scaling, \

                                                    fname, \

@@ -934,49 +936,49 @@ 9:

                                                    process_pixblock_head, \

                                                    process_pixblock_tail, \

                                                    process_pixblock_tail_head, \

                                                    dst_w_basereg_ = 28, \

                                                    dst_r_basereg_ = 4, \

                                                    src_basereg_   = 0, \

                                                    mask_basereg_  = 24

-    pixman_asm_function fname

+    pixman_asm_function \fname

     .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE

/*

  * Make some macro arguments globally visible and accessible

  * from other macros

*/

-    .set src_bpp, src_bpp_

-    .set mask_bpp, mask_bpp_

-    .set dst_w_bpp, dst_w_bpp_

-    .set pixblock_size, pixblock_size_

-    .set dst_w_basereg, dst_w_basereg_

-    .set dst_r_basereg, dst_r_basereg_

-    .set src_basereg, src_basereg_

-    .set mask_basereg, mask_basereg_

+    .set src_bpp, \src_bpp_

+    .set mask_bpp, \mask_bpp_

+    .set dst_w_bpp, \dst_w_bpp_

+    .set pixblock_size, \pixblock_size_

+    .set dst_w_basereg, \dst_w_basereg_

+    .set dst_r_basereg, \dst_r_basereg_

+    .set src_basereg, \src_basereg_

+    .set mask_basereg, \mask_basereg_

-.if use_nearest_scaling != 0

+.if \use_nearest_scaling != 0

/*

      * Assign symbolic names to registers for nearest scaling

*/

     W           .req        r0

     DST_W       .req        r1

     SRC         .req        r2

     VX          .req        r3

     UNIT_X      .req        ip

     MASK        .req        lr

     TMP1        .req        r4

     TMP2        .req        r5

     DST_R       .req        r6

     SRC_WIDTH_FIXED .req        r7

     .macro pixld_src x:vararg

-        pixld_s x

+        pixld_s \x

     .endm

     ldr         UNIT_X, [sp]

     push        {r4-r8, lr}

     ldr         SRC_WIDTH_FIXED, [sp, #(24 + 4)]

     .if mask_bpp != 0

     ldr         MASK, [sp, #(24 + 8)]

     .endif

@@ -986,89 +988,89 @@ 9:

*/

     W           .req        r0      /* width (is updated during processing) */

     DST_W       .req        r1      /* destination buffer pointer for writes */

     SRC         .req        r2      /* source buffer pointer */

     DST_R       .req        ip      /* destination buffer pointer for reads */

     MASK        .req        r3      /* mask pointer */

     .macro pixld_src x:vararg

-        pixld x

+        pixld \x

     .endm

 .endif

-.if (((flags) & FLAG_DST_READWRITE) != 0)

+.if (((\flags) & FLAG_DST_READWRITE) != 0)

     .set dst_r_bpp, dst_w_bpp

 .else

     .set dst_r_bpp, 0

 .endif

-.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0)

+.if (((\flags) & FLAG_DEINTERLEAVE_32BPP) != 0)

     .set DEINTERLEAVE_32BPP_ENABLED, 1

 .else

     .set DEINTERLEAVE_32BPP_ENABLED, 0

 .endif

     .macro fetch_src_pixblock

         pixld_src   pixblock_size, src_bpp, \

                     (src_basereg - pixblock_size * src_bpp / 64), SRC

     .endm

-    init

+    \init

     mov         DST_R, DST_W

     cmp         W, #pixblock_size

     blt         8f

-    ensure_destination_ptr_alignment process_pixblock_head, \

-                                     process_pixblock_tail, \

-                                     process_pixblock_tail_head

+    ensure_destination_ptr_alignment \process_pixblock_head, \

+                                     \process_pixblock_tail, \

+                                     \process_pixblock_tail_head

     subs        W, W, #pixblock_size

     blt         7f

     /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */

     pixld_a     pixblock_size, dst_r_bpp, \

                 (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R

     fetch_src_pixblock

     pixld       pixblock_size, mask_bpp, \

                 (mask_basereg - pixblock_size * mask_bpp / 64), MASK

-    process_pixblock_head

+    \process_pixblock_head

     subs        W, W, #pixblock_size

     blt         2f

1:

-    process_pixblock_tail_head

+    \process_pixblock_tail_head

     subs        W, W, #pixblock_size

     bge         1b

2:

-    process_pixblock_tail

+    \process_pixblock_tail

     pixst_a     pixblock_size, dst_w_bpp, \

                 (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W

7:

     /* Process the remaining trailing pixels in the scanline (dst aligned) */

     process_trailing_pixels 0, 1, \

-                            process_pixblock_head, \

-                            process_pixblock_tail, \

-                            process_pixblock_tail_head

+                            \process_pixblock_head, \

+                            \process_pixblock_tail, \

+                            \process_pixblock_tail_head

-    cleanup

-.if use_nearest_scaling != 0

+    \cleanup

+.if \use_nearest_scaling != 0

     pop         {r4-r8, pc}  /* exit */

 .else

     bx          lr  /* exit */

 .endif

8:

     /* Process the remaining trailing pixels in the scanline (dst unaligned) */

     process_trailing_pixels 0, 0, \

-                            process_pixblock_head, \

-                            process_pixblock_tail, \

-                            process_pixblock_tail_head

+                            \process_pixblock_head, \

+                            \process_pixblock_tail, \

+                            \process_pixblock_tail_head

-    cleanup

+    \cleanup

-.if use_nearest_scaling != 0

+.if \use_nearest_scaling != 0

     pop         {r4-r8, pc}  /* exit */

     .unreq      DST_R

     .unreq      SRC

     .unreq      W

     .unreq      VX

     .unreq      UNIT_X

     .unreq      TMP1

@@ -1085,25 +1087,25 @@ 8:

     .unreq      DST_R

     .unreq      DST_W

     .unreq      W

 .endif

     .purgem     fetch_src_pixblock

     .purgem     pixld_src

-    .endfunc

+    pixman_end_asm_function

 .endm

 .macro generate_composite_function_single_scanline x:vararg

-    generate_composite_function_scanline 0, x

+    generate_composite_function_scanline 0, \x

 .endm

 .macro generate_composite_function_nearest_scanline x:vararg

-    generate_composite_function_scanline 1, x

+    generate_composite_function_scanline 1, \x

 .endm

 /* Default prologue/epilogue, nothing special needs to be done */

 .macro default_init

 .endm

 .macro default_cleanup

@@ -1129,56 +1131,56 @@ 8:

  * Conversion of 8 r5g6b6 pixels packed in 128-bit register (in)

  * into a planar a8r8g8b8 format (with a, r, g, b color components

  * stored into 64-bit registers out_a, out_r, out_g, out_b respectively).

  * Warning: the conversion is destructive and the original

  *          value (in) is lost.

*/

 .macro convert_0565_to_8888 in, out_a, out_r, out_g, out_b

-    vshrn.u16   out_r, in,    #8

-    vshrn.u16   out_g, in,    #3

-    vsli.u16    in,    in,    #5

-    vmov.u8     out_a, #255

-    vsri.u8     out_r, out_r, #5

-    vsri.u8     out_g, out_g, #6

-    vshrn.u16   out_b, in,    #2

+    vshrn.u16   \out_r, \in,    #8

+    vshrn.u16   \out_g, \in,    #3

+    vsli.u16    \in,    \in,    #5

+    vmov.u8     \out_a, #255

+    vsri.u8     \out_r, \out_r, #5

+    vsri.u8     \out_g, \out_g, #6

+    vshrn.u16   \out_b, \in,    #2

 .endm

 .macro convert_0565_to_x888 in, out_r, out_g, out_b

-    vshrn.u16   out_r, in,    #8

-    vshrn.u16   out_g, in,    #3

-    vsli.u16    in,    in,    #5

-    vsri.u8     out_r, out_r, #5

-    vsri.u8     out_g, out_g, #6

-    vshrn.u16   out_b, in,    #2

+    vshrn.u16   \out_r, \in,    #8

+    vshrn.u16   \out_g, \in,    #3

+    vsli.u16    \in,    \in,    #5

+    vsri.u8     \out_r, \out_r, #5

+    vsri.u8     \out_g, \out_g, #6

+    vshrn.u16   \out_b, \in,    #2

 .endm

/*

  * Conversion from planar a8r8g8b8 format (with a, r, g, b color components

  * in 64-bit registers in_a, in_r, in_g, in_b respectively) into 8 r5g6b6

  * pixels packed in 128-bit register (out). Requires two temporary 128-bit

  * registers (tmp1, tmp2)

*/

 .macro convert_8888_to_0565 in_r, in_g, in_b, out, tmp1, tmp2

-    vshll.u8    tmp1, in_g, #8

-    vshll.u8    out, in_r, #8

-    vshll.u8    tmp2, in_b, #8

-    vsri.u16    out, tmp1, #5

-    vsri.u16    out, tmp2, #11

+    vshll.u8    \tmp1, \in_g, #8

+    vshll.u8    \out, \in_r, #8

+    vshll.u8    \tmp2, \in_b, #8

+    vsri.u16    \out, \tmp1, #5

+    vsri.u16    \out, \tmp2, #11

 .endm

/*

  * Conversion of four r5g6b5 pixels (in) to four x8r8g8b8 pixels

  * returned in (out0, out1) registers pair. Requires one temporary

  * 64-bit register (tmp). 'out1' and 'in' may overlap, the original

  * value from 'in' is lost

*/

 .macro convert_four_0565_to_x888_packed in, out0, out1, tmp

-    vshl.u16    out0, in,   #5  /* G top 6 bits */

-    vshl.u16    tmp,  in,   #11 /* B top 5 bits */

-    vsri.u16    in,   in,   #5  /* R is ready in top bits */

-    vsri.u16    out0, out0, #6  /* G is ready in top bits */

-    vsri.u16    tmp,  tmp,  #5  /* B is ready in top bits */

-    vshr.u16    out1, in,   #8  /* R is in place */

-    vsri.u16    out0, tmp,  #8  /* G & B is in place */

-    vzip.u16    out0, out1      /* everything is in place */

+    vshl.u16    \out0, \in,   #5  /* G top 6 bits */

+    vshl.u16    \tmp,  \in,   #11 /* B top 5 bits */

+    vsri.u16    \in,   \in,   #5  /* R is ready in top bits */

+    vsri.u16    \out0, \out0, #6  /* G is ready in top bits */

+    vsri.u16    \tmp,  \tmp,  #5  /* B is ready in top bits */

+    vshr.u16    \out1, \in,   #8  /* R is in place */

+    vsri.u16    \out0, \tmp,  #8  /* G & B is in place */

+    vzip.u16    \out0, \out1      /* everything is in place */

 .endm

diff --git a/gfx/cairo/libpixman/src/pixman-arm-simd-asm-scaled.S b/gfx/cairo/libpixman/src/pixman-arm-simd-asm-scaled.S

--- a/gfx/cairo/libpixman/src/pixman-arm-simd-asm-scaled.S

+++ b/gfx/cairo/libpixman/src/pixman-arm-simd-asm-scaled.S

@@ -20,16 +20,20 @@

  * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING

  * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS

  * SOFTWARE.

  * Author:  Jeff Muizelaar (jeff@infidigm.net)

*/

+#ifdef __clang__

+#define subpls subspl

+#endif

 /* Prevent the stack from becoming executable */

 #if defined(__linux__) && defined(__ELF__)

 .section .note.GNU-stack,"",%progbits

 #endif

 	.text

 	.arch armv6

 	.object_arch armv4

@@ -57,100 +61,105 @@

  *  prefetch_braking_distance - stop prefetching when that many pixels are

  *                              remaining before the end of scanline

*/

 .macro generate_nearest_scanline_func fname, bpp_shift, t,      \

                                       prefetch_distance,        \

                                       prefetch_braking_distance

-pixman_asm_function fname

+pixman_asm_function \fname

 	W		.req	r0

 	DST		.req	r1

 	SRC		.req	r2

 	VX		.req	r3

 	UNIT_X		.req	ip

 	TMP1		.req	r4

 	TMP2		.req	r5

 	VXMASK		.req	r6

 	PF_OFFS		.req	r7

 	SRC_WIDTH_FIXED	.req	r8

 	ldr	UNIT_X, [sp]

 	push	{r4, r5, r6, r7, r8, r10}

-	mvn	VXMASK, #((1 << bpp_shift) - 1)

+	mvn	VXMASK, #((1 << \bpp_shift) - 1)

 	ldr	SRC_WIDTH_FIXED, [sp, #28]

 	/* define helper macro */

 	.macro	scale_2_pixels

-		ldr&t	TMP1, [SRC, TMP1]

-		and	TMP2, VXMASK, VX, asr #(16 - bpp_shift)

+		ldr\()\t	TMP1, [SRC, TMP1]

+		and	TMP2, VXMASK, VX, asr #(16 - \bpp_shift)

 		adds	VX, VX, UNIT_X

-		str&t	TMP1, [DST], #(1 << bpp_shift)

+		str\()\t	TMP1, [DST], #(1 << \bpp_shift)

 9:		subpls	VX, VX, SRC_WIDTH_FIXED

 		bpl	9b

-		ldr&t	TMP2, [SRC, TMP2]

-		and	TMP1, VXMASK, VX, asr #(16 - bpp_shift)

+		ldr\()\t	TMP2, [SRC, TMP2]

+		and	TMP1, VXMASK, VX, asr #(16 - \bpp_shift)

 		adds	VX, VX, UNIT_X

-		str&t	TMP2, [DST], #(1 << bpp_shift)

+		str\()\t	TMP2, [DST], #(1 << \bpp_shift)

 9:		subpls	VX, VX, SRC_WIDTH_FIXED

 		bpl	9b

 	.endm

 	/* now do the scaling */

-	and	TMP1, VXMASK, VX, asr #(16 - bpp_shift)

+	and	TMP1, VXMASK, VX, asr #(16 - \bpp_shift)

 	adds	VX, VX, UNIT_X

 9:	subpls	VX, VX, SRC_WIDTH_FIXED

 	bpl	9b

-	subs	W, W, #(8 + prefetch_braking_distance)

+	subs	W, W, #(8 + \prefetch_braking_distance)

 	blt	2f

 	/* calculate prefetch offset */

-	mov	PF_OFFS, #prefetch_distance

+	mov	PF_OFFS, #\prefetch_distance

 	mla	PF_OFFS, UNIT_X, PF_OFFS, VX

 1:	/* main loop, process 8 pixels per iteration with prefetch */

-	pld	[SRC, PF_OFFS, asr #(16 - bpp_shift)]

+	pld	[SRC, PF_OFFS, asr #(16 - \bpp_shift)]

 	add	PF_OFFS, UNIT_X, lsl #3

 	scale_2_pixels

 	scale_2_pixels

 	scale_2_pixels

 	scale_2_pixels

 	subs	W, W, #8

 	bge	1b

2:

-	subs	W, W, #(4 - 8 - prefetch_braking_distance)

+	subs	W, W, #(4 - 8 - \prefetch_braking_distance)

 	blt	2f

 1:	/* process the remaining pixels */

 	scale_2_pixels

 	scale_2_pixels

 	subs	W, W, #4

 	bge	1b

2:

 	tst	W, #2

 	beq	2f

 	scale_2_pixels

2:

 	tst	W, #1

-	ldrne&t	TMP1, [SRC, TMP1]

-	strne&t	TMP1, [DST]

+#ifdef __clang__

+	ldr\()\t\()ne	TMP1, [SRC, TMP1]

+	str\()\t\()ne	TMP1, [DST]

+#else

+	ldrne\()\t	TMP1, [SRC, TMP1]

+	strne\()\t	TMP1, [DST]

+#endif

 	/* cleanup helper macro */

 	.purgem	scale_2_pixels

 	.unreq	DST

 	.unreq	SRC

 	.unreq	W

 	.unreq	VX

 	.unreq	UNIT_X

 	.unreq	TMP1

 	.unreq	TMP2

 	.unreq	VXMASK

 	.unreq	PF_OFFS

 	.unreq  SRC_WIDTH_FIXED

 	/* return */

 	pop	{r4, r5, r6, r7, r8, r10}

 	bx	lr

-.endfunc

+pixman_end_asm_function

 .endm

 generate_nearest_scanline_func \

     pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6, 1, h, 80, 32

 generate_nearest_scanline_func \

     pixman_scaled_nearest_scanline_8888_8888_SRC_asm_armv6, 2,  , 48, 32

diff --git a/gfx/cairo/libpixman/src/pixman-arm-simd-asm.S b/gfx/cairo/libpixman/src/pixman-arm-simd-asm.S

--- a/gfx/cairo/libpixman/src/pixman-arm-simd-asm.S

+++ b/gfx/cairo/libpixman/src/pixman-arm-simd-asm.S

@@ -20,16 +20,21 @@

  * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING

  * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS

  * SOFTWARE.

  * Author:  Ben Avison (bavison@riscosopen.org)

*/

+#ifdef __clang__

+#define adceqs adcseq

+#define ldmnedb ldmdbne

+#endif

 /* Prevent the stack from becoming executable */

 #if defined(__linux__) && defined(__ELF__)

 .section .note.GNU-stack,"",%progbits

 #endif

 	.text

 	.arch armv6

 	.object_arch armv4

@@ -52,26 +57,26 @@

  *   preload        If outputting 16 bytes causes 64 bytes to be read, whether an extra preload should be output

*/

 .macro blit_init

         line_saved_regs STRIDE_D, STRIDE_S

 .endm

 .macro blit_process_head   cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload

-        pixld   cond, numbytes, firstreg, SRC, unaligned_src

+        pixld   \cond, \numbytes, \firstreg, SRC, \unaligned_src

 .endm

 .macro blit_inner_loop  process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment

     WK4     .req    STRIDE_D

     WK5     .req    STRIDE_S

     WK6     .req    MASK

     WK7     .req    STRIDE_M

-110:    pixld   , 16, 0, SRC, unaligned_src

-        pixld   , 16, 4, SRC, unaligned_src

+110:    pixld   , 16, 0, SRC, \unaligned_src

+        pixld   , 16, 4, SRC, \unaligned_src

         pld     [SRC, SCRATCH]

         pixst   , 16, 0, DST

         pixst   , 16, 4, DST

         subs    X, X, #32*8/src_bpp

         bhs     110b

     .unreq  WK4

     .unreq  WK5

     .unreq  WK6

@@ -137,17 +142,17 @@ generate_composite_function \

         mov     STRIDE_M, SRC

 .endm

 .macro fill_process_tail  cond, numbytes, firstreg

     WK4     .req    SRC

     WK5     .req    STRIDE_S

     WK6     .req    MASK

     WK7     .req    STRIDE_M

-        pixst   cond, numbytes, 4, DST

+        pixst   \cond, \numbytes, 4, DST

     .unreq  WK4

     .unreq  WK5

     .unreq  WK6

     .unreq  WK7

 .endm

 generate_composite_function \

     pixman_composite_src_n_8888_asm_armv6, 0, 0, 32, \

@@ -177,30 +182,30 @@ generate_composite_function \

     nop_macro, /* newline */ \

     nop_macro /* cleanup */ \

     nop_macro /* process head */ \

     fill_process_tail

 /******************************************************************************/

 .macro src_x888_8888_pixel, cond, reg

-        orr&cond WK&reg, WK&reg, #0xFF000000

+        orr\()\cond WK\()\reg, WK\()\reg, #0xFF000000

 .endm

 .macro pixman_composite_src_x888_8888_process_head   cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload

-        pixld   cond, numbytes, firstreg, SRC, unaligned_src

+        pixld   \cond, \numbytes, \firstreg, SRC, \unaligned_src

 .endm

 .macro pixman_composite_src_x888_8888_process_tail   cond, numbytes, firstreg

-        src_x888_8888_pixel cond, %(firstreg+0)

- .if numbytes >= 8

-        src_x888_8888_pixel cond, %(firstreg+1)

-  .if numbytes == 16

-        src_x888_8888_pixel cond, %(firstreg+2)

-        src_x888_8888_pixel cond, %(firstreg+3)

+        src_x888_8888_pixel \cond, %(\firstreg+0)

+ .if \numbytes >= 8

+        src_x888_8888_pixel \cond, %(\firstreg+1)

+  .if \numbytes == 16

+        src_x888_8888_pixel \cond, %(\firstreg+2)

+        src_x888_8888_pixel \cond, %(\firstreg+3)

   .endif

  .endif

 .endm

 generate_composite_function \

     pixman_composite_src_x888_8888_asm_armv6, 32, 0, 32, \

     FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_SCRATCH, \

     3, /* prefetch distance */ \

@@ -217,83 +222,83 @@ generate_composite_function \

         ldr     MASK, =0x07E007E0

         mov     STRIDE_M, #0xFF000000

         /* Set GE[3:0] to 1010 so SEL instructions do what we want */

         ldr     SCRATCH, =0x80008000

         uadd8   SCRATCH, SCRATCH, SCRATCH

 .endm

 .macro src_0565_8888_2pixels, reg1, reg2

-        and     SCRATCH, WK&reg1, MASK             @ 00000GGGGGG0000000000gggggg00000

-        bic     WK&reg2, WK&reg1, MASK             @ RRRRR000000BBBBBrrrrr000000bbbbb

-        orr     SCRATCH, SCRATCH, SCRATCH, lsr #6  @ 00000GGGGGGGGGGGG0000ggggggggggg

-        mov     WK&reg1, WK&reg2, lsl #16          @ rrrrr000000bbbbb0000000000000000

-        mov     SCRATCH, SCRATCH, ror #19          @ GGGG0000ggggggggggg00000GGGGGGGG

-        bic     WK&reg2, WK&reg2, WK&reg1, lsr #16 @ RRRRR000000BBBBB0000000000000000

-        orr     WK&reg1, WK&reg1, WK&reg1, lsr #5  @ rrrrrrrrrr0bbbbbbbbbb00000000000

-        orr     WK&reg2, WK&reg2, WK&reg2, lsr #5  @ RRRRRRRRRR0BBBBBBBBBB00000000000

-        pkhtb   WK&reg1, WK&reg1, WK&reg1, asr #5  @ rrrrrrrr--------bbbbbbbb--------

-        sel     WK&reg1, WK&reg1, SCRATCH          @ rrrrrrrrggggggggbbbbbbbb--------

-        mov     SCRATCH, SCRATCH, ror #16          @ ggg00000GGGGGGGGGGGG0000gggggggg

-        pkhtb   WK&reg2, WK&reg2, WK&reg2, asr #5  @ RRRRRRRR--------BBBBBBBB--------

-        sel     WK&reg2, WK&reg2, SCRATCH          @ RRRRRRRRGGGGGGGGBBBBBBBB--------

-        orr     WK&reg1, STRIDE_M, WK&reg1, lsr #8 @ 11111111rrrrrrrrggggggggbbbbbbbb

-        orr     WK&reg2, STRIDE_M, WK&reg2, lsr #8 @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB

+        and     SCRATCH, WK\()\reg1, MASK                   @ 00000GGGGGG0000000000gggggg00000

+        bic     WK\()\reg2, WK\()\reg1, MASK                @ RRRRR000000BBBBBrrrrr000000bbbbb

+        orr     SCRATCH, SCRATCH, SCRATCH, lsr #6           @ 00000GGGGGGGGGGGG0000ggggggggggg

+        mov     WK\()\reg1, WK\()\reg2, lsl #16             @ rrrrr000000bbbbb0000000000000000

+        mov     SCRATCH, SCRATCH, ror #19                   @ GGGG0000ggggggggggg00000GGGGGGGG

+        bic     WK\()\reg2, WK\()\reg2, WK\()\reg1, lsr #16 @ RRRRR000000BBBBB0000000000000000

+        orr     WK\()\reg1, WK\()\reg1, WK\()\reg1, lsr #5  @ rrrrrrrrrr0bbbbbbbbbb00000000000

+        orr     WK\()\reg2, WK\()\reg2, WK\()\reg2, lsr #5  @ RRRRRRRRRR0BBBBBBBBBB00000000000

+        pkhtb   WK\()\reg1, WK\()\reg1, WK\()\reg1, asr #5  @ rrrrrrrr--------bbbbbbbb--------

+        sel     WK\()\reg1, WK\()\reg1, SCRATCH             @ rrrrrrrrggggggggbbbbbbbb--------

+        mov     SCRATCH, SCRATCH, ror #16                   @ ggg00000GGGGGGGGGGGG0000gggggggg

+        pkhtb   WK\()\reg2, WK\()\reg2, WK\()\reg2, asr #5  @ RRRRRRRR--------BBBBBBBB--------

+        sel     WK\()\reg2, WK\()\reg2, SCRATCH             @ RRRRRRRRGGGGGGGGBBBBBBBB--------

+        orr     WK\()\reg1, STRIDE_M, WK\()\reg1, lsr #8    @ 11111111rrrrrrrrggggggggbbbbbbbb

+        orr     WK\()\reg2, STRIDE_M, WK\()\reg2, lsr #8    @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB

 .endm

 /* This version doesn't need STRIDE_M, but is one instruction longer.

    It would however be preferable for an XRGB target, since we could knock off the last 2 instructions, but is that a common case?

-        and     SCRATCH, WK&reg1, MASK             @ 00000GGGGGG0000000000gggggg00000

-        bic     WK&reg1, WK&reg1, MASK             @ RRRRR000000BBBBBrrrrr000000bbbbb

-        orr     SCRATCH, SCRATCH, SCRATCH, lsr #6  @ 00000GGGGGGGGGGGG0000ggggggggggg

-        mov     WK&reg2, WK&reg1, lsr #16          @ 0000000000000000RRRRR000000BBBBB

-        mov     SCRATCH, SCRATCH, ror #27          @ GGGGGGGGGGGG0000ggggggggggg00000

-        bic     WK&reg1, WK&reg1, WK&reg2, lsl #16 @ 0000000000000000rrrrr000000bbbbb

-        mov     WK&reg2, WK&reg2, lsl #3           @ 0000000000000RRRRR000000BBBBB000

-        mov     WK&reg1, WK&reg1, lsl #3           @ 0000000000000rrrrr000000bbbbb000

-        orr     WK&reg2, WK&reg2, WK&reg2, lsr #5  @ 0000000000000RRRRRRRRRR0BBBBBBBB

-        orr     WK&reg1, WK&reg1, WK&reg1, lsr #5  @ 0000000000000rrrrrrrrrr0bbbbbbbb

-        pkhbt   WK&reg2, WK&reg2, WK&reg2, lsl #5  @ --------RRRRRRRR--------BBBBBBBB

-        pkhbt   WK&reg1, WK&reg1, WK&reg1, lsl #5  @ --------rrrrrrrr--------bbbbbbbb

-        sel     WK&reg2, SCRATCH, WK&reg2          @ --------RRRRRRRRGGGGGGGGBBBBBBBB

-        sel     WK&reg1, SCRATCH, WK&reg1          @ --------rrrrrrrrggggggggbbbbbbbb

-        orr     WK&reg2, WK&reg2, #0xFF000000      @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB

-        orr     WK&reg1, WK&reg1, #0xFF000000      @ 11111111rrrrrrrrggggggggbbbbbbbb

+        and     SCRATCH, WK\()\reg1, MASK                   @ 00000GGGGGG0000000000gggggg00000

+        bic     WK\()\reg1, WK\()\reg1, MASK                @ RRRRR000000BBBBBrrrrr000000bbbbb

+        orr     SCRATCH, SCRATCH, SCRATCH, lsr #6           @ 00000GGGGGGGGGGGG0000ggggggggggg

+        mov     WK\()\reg2, WK\()\reg1, lsr #16             @ 0000000000000000RRRRR000000BBBBB

+        mov     SCRATCH, SCRATCH, ror #27                   @ GGGGGGGGGGGG0000ggggggggggg00000

+        bic     WK\()\reg1, WK\()\reg1, WK\()\reg2, lsl #16 @ 0000000000000000rrrrr000000bbbbb

+        mov     WK\()\reg2, WK\()\reg2, lsl #3              @ 0000000000000RRRRR000000BBBBB000

+        mov     WK\()\reg1, WK\()\reg1, lsl #3              @ 0000000000000rrrrr000000bbbbb000

+        orr     WK\()\reg2, WK\()\reg2, WK\()\reg2, lsr #5  @ 0000000000000RRRRRRRRRR0BBBBBBBB

+        orr     WK\()\reg1, WK\()\reg1, WK\()\reg1, lsr #5  @ 0000000000000rrrrrrrrrr0bbbbbbbb

+        pkhbt   WK\()\reg2, WK\()\reg2, WK\()\reg2, lsl #5  @ --------RRRRRRRR--------BBBBBBBB

+        pkhbt   WK\()\reg1, WK\()\reg1, WK\()\reg1, lsl #5  @ --------rrrrrrrr--------bbbbbbbb

+        sel     WK\()\reg2, SCRATCH, WK\()\reg2             @ --------RRRRRRRRGGGGGGGGBBBBBBBB

+        sel     WK\()\reg1, SCRATCH, WK\()\reg1             @ --------rrrrrrrrggggggggbbbbbbbb

+        orr     WK\()\reg2, WK\()\reg2, #0xFF000000         @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB

+        orr     WK\()\reg1, WK\()\reg1, #0xFF000000         @ 11111111rrrrrrrrggggggggbbbbbbbb

*/

 .macro src_0565_8888_1pixel, reg

-        bic     SCRATCH, WK&reg, MASK              @ 0000000000000000rrrrr000000bbbbb

-        and     WK&reg, WK&reg, MASK               @ 000000000000000000000gggggg00000

-        mov     SCRATCH, SCRATCH, lsl #3           @ 0000000000000rrrrr000000bbbbb000

-        mov     WK&reg, WK&reg, lsl #5             @ 0000000000000000gggggg0000000000

-        orr     SCRATCH, SCRATCH, SCRATCH, lsr #5  @ 0000000000000rrrrrrrrrr0bbbbbbbb

-        orr     WK&reg, WK&reg, WK&reg, lsr #6     @ 000000000000000gggggggggggg00000

-        pkhbt   SCRATCH, SCRATCH, SCRATCH, lsl #5  @ --------rrrrrrrr--------bbbbbbbb

-        sel     WK&reg, WK&reg, SCRATCH            @ --------rrrrrrrrggggggggbbbbbbbb

-        orr     WK&reg, WK&reg, #0xFF000000        @ 11111111rrrrrrrrggggggggbbbbbbbb

+        bic     SCRATCH, WK\()\reg, MASK                 @ 0000000000000000rrrrr000000bbbbb

+        and     WK\()\reg, WK\()\reg, MASK               @ 000000000000000000000gggggg00000

+        mov     SCRATCH, SCRATCH, lsl #3                 @ 0000000000000rrrrr000000bbbbb000

+        mov     WK\()\reg, WK\()\reg, lsl #5             @ 0000000000000000gggggg0000000000

+        orr     SCRATCH, SCRATCH, SCRATCH, lsr #5        @ 0000000000000rrrrrrrrrr0bbbbbbbb

+        orr     WK\()\reg, WK\()\reg, WK\()\reg, lsr #6  @ 000000000000000gggggggggggg00000

+        pkhbt   SCRATCH, SCRATCH, SCRATCH, lsl #5        @ --------rrrrrrrr--------bbbbbbbb

+        sel     WK\()\reg, WK\()\reg, SCRATCH            @ --------rrrrrrrrggggggggbbbbbbbb

+        orr     WK\()\reg, WK\()\reg, #0xFF000000        @ 11111111rrrrrrrrggggggggbbbbbbbb

 .endm

 .macro src_0565_8888_process_head   cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload

- .if numbytes == 16

-        pixldst ld,, 8, firstreg, %(firstreg+2),,, SRC, unaligned_src

- .elseif numbytes == 8

-        pixld   , 4, firstreg, SRC, unaligned_src

- .elseif numbytes == 4

-        pixld   , 2, firstreg, SRC, unaligned_src

+ .if \numbytes == 16

+        pixldst ld,, 8, \firstreg, %(\firstreg+2),,, SRC, \unaligned_src

+ .elseif \numbytes == 8

+        pixld   , 4, \firstreg, SRC, \unaligned_src

+ .elseif \numbytes == 4

+        pixld   , 2, \firstreg, SRC, \unaligned_src

  .endif

 .endm

 .macro src_0565_8888_process_tail   cond, numbytes, firstreg

- .if numbytes == 16

-        src_0565_8888_2pixels firstreg, %(firstreg+1)

-        src_0565_8888_2pixels %(firstreg+2), %(firstreg+3)

- .elseif numbytes == 8

-        src_0565_8888_2pixels firstreg, %(firstreg+1)

+ .if \numbytes == 16

+        src_0565_8888_2pixels \firstreg, %(\firstreg+1)

+        src_0565_8888_2pixels %(\firstreg+2), %(\firstreg+3)

+ .elseif \numbytes == 8

+        src_0565_8888_2pixels \firstreg, %(\firstreg+1)

  .else

-        src_0565_8888_1pixel firstreg

+        src_0565_8888_1pixel \firstreg

  .endif

 .endm

 generate_composite_function \

     pixman_composite_src_0565_8888_asm_armv6, 16, 0, 32, \

     FLAG_DST_WRITEONLY | FLAG_BRANCH_OVER, \

     3, /* prefetch distance */ \

     src_0565_8888_init, \

@@ -306,67 +311,67 @@ generate_composite_function \

 .macro src_x888_0565_init

         /* Hold loop invariant in MASK */

         ldr     MASK, =0x001F001F

         line_saved_regs  STRIDE_S, ORIG_W

 .endm

 .macro src_x888_0565_1pixel  s, d

-        and     WK&d, MASK, WK&s, lsr #3           @ 00000000000rrrrr00000000000bbbbb

-        and     STRIDE_S, WK&s, #0xFC00            @ 0000000000000000gggggg0000000000

-        orr     WK&d, WK&d, WK&d, lsr #5           @ 00000000000-----rrrrr000000bbbbb

-        orr     WK&d, WK&d, STRIDE_S, lsr #5       @ 00000000000-----rrrrrggggggbbbbb

+        and     WK\()\d, MASK, WK\()\s, lsr #3           @ 00000000000rrrrr00000000000bbbbb

+        and     STRIDE_S, WK\()\s, #0xFC00               @ 0000000000000000gggggg0000000000

+        orr     WK\()\d, WK\()\d, WK\()\d, lsr #5        @ 00000000000-----rrrrr000000bbbbb

+        orr     WK\()\d, WK\()\d, STRIDE_S, lsr #5       @ 00000000000-----rrrrrggggggbbbbb

         /* Top 16 bits are discarded during the following STRH */

 .endm

 .macro src_x888_0565_2pixels  slo, shi, d, tmp

-        and     SCRATCH, WK&shi, #0xFC00           @ 0000000000000000GGGGGG0000000000

-        and     WK&tmp, MASK, WK&shi, lsr #3       @ 00000000000RRRRR00000000000BBBBB

-        and     WK&shi, MASK, WK&slo, lsr #3       @ 00000000000rrrrr00000000000bbbbb

-        orr     WK&tmp, WK&tmp, WK&tmp, lsr #5     @ 00000000000-----RRRRR000000BBBBB

-        orr     WK&tmp, WK&tmp, SCRATCH, lsr #5    @ 00000000000-----RRRRRGGGGGGBBBBB

-        and     SCRATCH, WK&slo, #0xFC00           @ 0000000000000000gggggg0000000000

-        orr     WK&shi, WK&shi, WK&shi, lsr #5     @ 00000000000-----rrrrr000000bbbbb

-        orr     WK&shi, WK&shi, SCRATCH, lsr #5    @ 00000000000-----rrrrrggggggbbbbb

-        pkhbt   WK&d, WK&shi, WK&tmp, lsl #16      @ RRRRRGGGGGGBBBBBrrrrrggggggbbbbb

+        and     SCRATCH, WK\()\shi, #0xFC00                 @ 0000000000000000GGGGGG0000000000

+        and     WK\()\tmp, MASK, WK\()\shi, lsr #3          @ 00000000000RRRRR00000000000BBBBB

+        and     WK\()\shi, MASK, WK\()\slo, lsr #3          @ 00000000000rrrrr00000000000bbbbb

+        orr     WK\()\tmp, WK\()\tmp, WK\()\tmp, lsr #5     @ 00000000000-----RRRRR000000BBBBB

+        orr     WK\()\tmp, WK\()\tmp, SCRATCH, lsr #5       @ 00000000000-----RRRRRGGGGGGBBBBB

+        and     SCRATCH, WK\()\slo, #0xFC00                 @ 0000000000000000gggggg0000000000

+        orr     WK\()\shi, WK\()\shi, WK\()\shi, lsr #5     @ 00000000000-----rrrrr000000bbbbb

+        orr     WK\()\shi, WK\()\shi, SCRATCH, lsr #5       @ 00000000000-----rrrrrggggggbbbbb

+        pkhbt   WK\()\d, WK\()\shi, WK\()\tmp, lsl #16      @ RRRRRGGGGGGBBBBBrrrrrggggggbbbbb

 .endm

 .macro src_x888_0565_process_head   cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload

         WK4     .req    STRIDE_S

         WK5     .req    STRIDE_M

         WK6     .req    WK3

         WK7     .req    ORIG_W

- .if numbytes == 16

+ .if \numbytes == 16

         pixld   , 16, 4, SRC, 0

         src_x888_0565_2pixels  4, 5, 0, 0

         pixld   , 8, 4, SRC, 0

         src_x888_0565_2pixels  6, 7, 1, 1

         pixld   , 8, 6, SRC, 0

  .else

-        pixld   , numbytes*2, 4, SRC, 0

+        pixld   , \numbytes*2, 4, SRC, 0

  .endif

 .endm

 .macro src_x888_0565_process_tail   cond, numbytes, firstreg

- .if numbytes == 16

+ .if \numbytes == 16

         src_x888_0565_2pixels  4, 5, 2, 2

         src_x888_0565_2pixels  6, 7, 3, 4

- .elseif numbytes == 8

+ .elseif \numbytes == 8

         src_x888_0565_2pixels  4, 5, 1, 1

         src_x888_0565_2pixels  6, 7, 2, 2

- .elseif numbytes == 4

+ .elseif \numbytes == 4

         src_x888_0565_2pixels  4, 5, 1, 1

  .else

         src_x888_0565_1pixel  4, 1

  .endif

- .if numbytes == 16

-        pixst   , numbytes, 0, DST

+ .if \numbytes == 16

+        pixst   , \numbytes, 0, DST

  .else

-        pixst   , numbytes, 1, DST

+        pixst   , \numbytes, 1, DST

  .endif

         .unreq  WK4

         .unreq  WK5

         .unreq  WK6

         .unreq  WK7

 .endm

 generate_composite_function \

@@ -377,47 +382,47 @@ generate_composite_function \

     nop_macro, /* newline */ \

     nop_macro, /* cleanup */ \

     src_x888_0565_process_head, \

     src_x888_0565_process_tail

 /******************************************************************************/

 .macro add_8_8_8pixels  cond, dst1, dst2

-        uqadd8&cond  WK&dst1, WK&dst1, MASK

-        uqadd8&cond  WK&dst2, WK&dst2, STRIDE_M

+        uqadd8\()\cond  WK\()\dst1, WK\()\dst1, MASK

+        uqadd8\()\cond  WK\()\dst2, WK\()\dst2, STRIDE_M

 .endm

 .macro add_8_8_4pixels  cond, dst

-        uqadd8&cond  WK&dst, WK&dst, MASK

+        uqadd8\()\cond  WK\()\dst, WK\()\dst, MASK

 .endm

 .macro add_8_8_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload

     WK4     .req    MASK

     WK5     .req    STRIDE_M

- .if numbytes == 16

-        pixld   cond, 8, 4, SRC, unaligned_src

-        pixld   cond, 16, firstreg, DST, 0

-        add_8_8_8pixels cond, firstreg, %(firstreg+1)

-        pixld   cond, 8, 4, SRC, unaligned_src

+ .if \numbytes == 16

+        pixld   \cond, 8, 4, SRC, \unaligned_src

+        pixld   \cond, 16, \firstreg, DST, 0

+        add_8_8_8pixels \cond, \firstreg, %(\firstreg+1)

+        pixld   \cond, 8, 4, SRC, \unaligned_src

  .else

-        pixld   cond, numbytes, 4, SRC, unaligned_src

-        pixld   cond, numbytes, firstreg, DST, 0

+        pixld   \cond, \numbytes, 4, SRC, \unaligned_src

+        pixld   \cond, \numbytes, \firstreg, DST, 0

  .endif

     .unreq  WK4

     .unreq  WK5

 .endm

 .macro add_8_8_process_tail  cond, numbytes, firstreg

- .if numbytes == 16

-        add_8_8_8pixels cond, %(firstreg+2), %(firstreg+3)

- .elseif numbytes == 8

-        add_8_8_8pixels cond, firstreg, %(firstreg+1)

+ .if \numbytes == 16

+        add_8_8_8pixels \cond, %(\firstreg+2), %(\firstreg+3)

+ .elseif \numbytes == 8

+        add_8_8_8pixels \cond, \firstreg, %(\firstreg+1)

  .else

-        add_8_8_4pixels cond, firstreg

+        add_8_8_4pixels \cond, \firstreg

  .endif

 .endm

 generate_composite_function \

     pixman_composite_add_8_8_asm_armv6, 8, 0, 8, \

     FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_PRESERVES_SCRATCH, \

     2, /* prefetch distance */ \

     nop_macro, /* init */ \

@@ -436,82 +441,82 @@ generate_composite_function \

         line_saved_regs STRIDE_D, STRIDE_S, ORIG_W

 .endm

 .macro over_8888_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload

     WK4     .req    STRIDE_D

     WK5     .req    STRIDE_S

     WK6     .req    STRIDE_M

     WK7     .req    ORIG_W

-        pixld   , numbytes, %(4+firstreg), SRC, unaligned_src

-        pixld   , numbytes, firstreg, DST, 0

+        pixld   , \numbytes, %(4+\firstreg), SRC, \unaligned_src

+        pixld   , \numbytes, \firstreg, DST, 0

     .unreq  WK4

     .unreq  WK5

     .unreq  WK6

     .unreq  WK7

 .endm

 .macro over_8888_8888_check_transparent  numbytes, reg0, reg1, reg2, reg3

         /* Since these colours a premultiplied by alpha, only 0 indicates transparent (any other colour with 0 in the alpha byte is luminous) */

-        teq     WK&reg0, #0

- .if numbytes > 4

-        teqeq   WK&reg1, #0

-  .if numbytes > 8

-        teqeq   WK&reg2, #0

-        teqeq   WK&reg3, #0

+        teq     WK\()\reg0, #0

+ .if \numbytes > 4

+        teqeq   WK\()\reg1, #0

+  .if \numbytes > 8

+        teqeq   WK\()\reg2, #0

+        teqeq   WK\()\reg3, #0

   .endif

  .endif

 .endm

 .macro over_8888_8888_prepare  next

-        mov     WK&next, WK&next, lsr #24

+        mov     WK\()\next, WK\()\next, lsr #24

 .endm

 .macro over_8888_8888_1pixel src, dst, offset, next

         /* src = destination component multiplier */

-        rsb     WK&src, WK&src, #255

+        rsb     WK\()\src, WK\()\src, #255

         /* Split even/odd bytes of dst into SCRATCH/dst */

-        uxtb16  SCRATCH, WK&dst

-        uxtb16  WK&dst, WK&dst, ror #8

+        uxtb16  SCRATCH, WK\()\dst

+        uxtb16  WK\()\dst, WK\()\dst, ror #8

         /* Multiply through, adding 0.5 to the upper byte of result for rounding */

-        mla     SCRATCH, SCRATCH, WK&src, MASK

-        mla     WK&dst, WK&dst, WK&src, MASK

+        mla     SCRATCH, SCRATCH, WK\()\src, MASK

+        mla     WK\()\dst, WK\()\dst, WK\()\src, MASK

         /* Where we would have had a stall between the result of the first MLA and the shifter input,

          * reload the complete source pixel */

-        ldr     WK&src, [SRC, #offset]

+        ldr     WK\()\src, [SRC, #\offset]

         /* Multiply by 257/256 to approximate 256/255 */

         uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8

         /* In this stall, start processing the next pixel */

- .if offset < -4

-        mov     WK&next, WK&next, lsr #24

+ .if \offset < -4

+        mov     WK\()\next, WK\()\next, lsr #24

  .endif

-        uxtab16 WK&dst, WK&dst, WK&dst, ror #8

+        uxtab16 WK\()\dst, WK\()\dst, WK\()\dst, ror #8

         /* Recombine even/odd bytes of multiplied destination */

         mov     SCRATCH, SCRATCH, ror #8

-        sel     WK&dst, SCRATCH, WK&dst

+        sel     WK\()\dst, SCRATCH, WK\()\dst

         /* Saturated add of source to multiplied destination */

-        uqadd8  WK&dst, WK&dst, WK&src

+        uqadd8  WK\()\dst, WK\()\dst, WK\()\src

 .endm

 .macro over_8888_8888_process_tail  cond, numbytes, firstreg

     WK4     .req    STRIDE_D

     WK5     .req    STRIDE_S

     WK6     .req    STRIDE_M

     WK7     .req    ORIG_W

-        over_8888_8888_check_transparent numbytes, %(4+firstreg), %(5+firstreg), %(6+firstreg), %(7+firstreg)

+        over_8888_8888_check_transparent \numbytes, %(4+\firstreg), %(5+\firstreg), %(6+\firstreg), %(7+\firstreg)

         beq     10f

-        over_8888_8888_prepare  %(4+firstreg)

- .set PROCESS_REG, firstreg

- .set PROCESS_OFF, -numbytes

- .rept numbytes / 4

+        over_8888_8888_prepare  %(4+\firstreg)

+ .set PROCESS_REG, \firstreg

+ .set PROCESS_OFF, -\numbytes

+ .rept \numbytes / 4

         over_8888_8888_1pixel %(4+PROCESS_REG), %(0+PROCESS_REG), PROCESS_OFF, %(5+PROCESS_REG)

   .set PROCESS_REG, PROCESS_REG+1

   .set PROCESS_OFF, PROCESS_OFF+4

  .endr

-        pixst   , numbytes, firstreg, DST

+        pixst   , \numbytes, \firstreg, DST

10:

     .unreq  WK4

     .unreq  WK5

     .unreq  WK6

     .unreq  WK7

 .endm

 generate_composite_function \

@@ -531,26 +536,26 @@ generate_composite_function \

  * word  Register containing 4 bytes

  * byte  Register containing byte multiplier (bits 8-31 must be 0)

  * tmp   Scratch register

  * half  Register containing the constant 0x00800080

  * GE[3:0] bits must contain 0101

*/

 .macro mul_8888_8  word, byte, tmp, half

         /* Split even/odd bytes of word apart */

-        uxtb16  tmp, word

-        uxtb16  word, word, ror #8

+        uxtb16  \tmp, \word

+        uxtb16  \word, \word, ror #8

         /* Multiply bytes together with rounding, then by 257/256 */

-        mla     tmp, tmp, byte, half

-        mla     word, word, byte, half /* 1 stall follows */

-        uxtab16 tmp, tmp, tmp, ror #8  /* 1 stall follows */

-        uxtab16 word, word, word, ror #8

+        mla     \tmp, \tmp, \byte, \half

+        mla     \word, \word, \byte, \half /* 1 stall follows */

+        uxtab16 \tmp, \tmp, \tmp, ror #8  /* 1 stall follows */

+        uxtab16 \word, \word, \word, ror #8

         /* Recombine bytes */

-        mov     tmp, tmp, ror #8

-        sel     word, tmp, word

+        mov     \tmp, \tmp, ror #8

+        sel     \word, \tmp, \word

 .endm

 /******************************************************************************/

 .macro over_8888_n_8888_init

         /* Mask is constant */

         ldr     MASK, [sp, #ARGS_STACK_OFFSET+8]

         /* Hold loop invariant in STRIDE_M */

@@ -562,51 +567,51 @@ generate_composite_function \

         line_saved_regs Y, STRIDE_D, STRIDE_S, ORIG_W

 .endm

 .macro over_8888_n_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload

     WK4     .req    Y

     WK5     .req    STRIDE_D

     WK6     .req    STRIDE_S

     WK7     .req    ORIG_W

-        pixld   , numbytes, %(4+(firstreg%2)), SRC, unaligned_src

-        pixld   , numbytes, firstreg, DST, 0

+        pixld   , \numbytes, %(4+(\firstreg%2)), SRC, \unaligned_src

+        pixld   , \numbytes, \firstreg, DST, 0

     .unreq  WK4

     .unreq  WK5

     .unreq  WK6

     .unreq  WK7

 .endm

 .macro over_8888_n_8888_1pixel src, dst

-        mul_8888_8  WK&src, MASK, SCRATCH, STRIDE_M

-        sub     WK7, WK6, WK&src, lsr #24

-        mul_8888_8  WK&dst, WK7, SCRATCH, STRIDE_M

-        uqadd8  WK&dst, WK&dst, WK&src

+        mul_8888_8  WK\()\src, MASK, SCRATCH, STRIDE_M

+        sub     WK7, WK6, WK\()\src, lsr #24

+        mul_8888_8  WK\()\dst, WK7, SCRATCH, STRIDE_M

+        uqadd8  WK\()\dst, WK\()\dst, WK\()\src

 .endm

 .macro over_8888_n_8888_process_tail  cond, numbytes, firstreg

     WK4     .req    Y

     WK5     .req    STRIDE_D

     WK6     .req    STRIDE_S

     WK7     .req    ORIG_W

-        over_8888_8888_check_transparent numbytes, %(4+(firstreg%2)), %(5+(firstreg%2)), %(6+firstreg), %(7+firstreg)

+        over_8888_8888_check_transparent \numbytes, %(4+(\firstreg%2)), %(5+(\firstreg%2)), %(6+\firstreg), %(7+\firstreg)

         beq     10f

         mov     WK6, #255

- .set PROCESS_REG, firstreg

- .rept numbytes / 4

-  .if numbytes == 16 && PROCESS_REG == 2

+ .set PROCESS_REG, \firstreg

+ .rept \numbytes / 4

+  .if \numbytes == 16 && PROCESS_REG == 2

         /* We're using WK6 and WK7 as temporaries, so half way through

          * 4 pixels, reload the second two source pixels but this time

          * into WK4 and WK5 */

         ldmdb   SRC, {WK4, WK5}

   .endif

         over_8888_n_8888_1pixel  %(4+(PROCESS_REG%2)), %(PROCESS_REG)

   .set PROCESS_REG, PROCESS_REG+1

  .endr

-        pixst   , numbytes, firstreg, DST

+        pixst   , \numbytes, \firstreg, DST

10:

     .unreq  WK4

     .unreq  WK5

     .unreq  WK6

     .unreq  WK7

 .endm

 generate_composite_function \

@@ -637,47 +642,47 @@ generate_composite_function \

         ldr     STRIDE_D, =0x00800080

         b       1f

  .ltorg

1:

 .endm

 .macro over_n_8_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload

     WK4     .req    STRIDE_M

-        pixld   , numbytes/4, 4, MASK, unaligned_mask

-        pixld   , numbytes, firstreg, DST, 0

+        pixld   , \numbytes/4, 4, MASK, \unaligned_mask

+        pixld   , \numbytes, \firstreg, DST, 0

     .unreq  WK4

 .endm

 .macro over_n_8_8888_1pixel src, dst

-        uxtb    Y, WK4, ror #src*8

+        uxtb    Y, WK4, ror #\src*8

         /* Trailing part of multiplication of source */

         mla     SCRATCH, STRIDE_S, Y, STRIDE_D

         mla     Y, SRC, Y, STRIDE_D

         mov     ORIG_W, #255

         uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8

         uxtab16 Y, Y, Y, ror #8

         mov     SCRATCH, SCRATCH, ror #8

         sub     ORIG_W, ORIG_W, Y, lsr #24

         sel     Y, SCRATCH, Y

         /* Then multiply the destination */

-        mul_8888_8  WK&dst, ORIG_W, SCRATCH, STRIDE_D

-        uqadd8  WK&dst, WK&dst, Y

+        mul_8888_8  WK\()\dst, ORIG_W, SCRATCH, STRIDE_D

+        uqadd8  WK\()\dst, WK\()\dst, Y

 .endm

 .macro over_n_8_8888_process_tail  cond, numbytes, firstreg

     WK4     .req    STRIDE_M

         teq     WK4, #0

         beq     10f

- .set PROCESS_REG, firstreg

- .rept numbytes / 4

-        over_n_8_8888_1pixel  %(PROCESS_REG-firstreg), %(PROCESS_REG)

+ .set PROCESS_REG, \firstreg

+ .rept \numbytes / 4

+        over_n_8_8888_1pixel  %(PROCESS_REG-\firstreg), %(PROCESS_REG)

   .set PROCESS_REG, PROCESS_REG+1

  .endr

-        pixst   , numbytes, firstreg, DST

+        pixst   , \numbytes, \firstreg, DST

10:

     .unreq  WK4

 .endm

 generate_composite_function \

     pixman_composite_over_n_8_8888_asm_armv6, 0, 8, 32 \

     FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS \

     2, /* prefetch distance */ \

@@ -700,64 +705,64 @@ generate_composite_function \

         line_saved_regs  STRIDE_D, ORIG_W

 .endm

 .macro over_reverse_n_8888_newline

         mov     STRIDE_D, #0xFF

 .endm

 .macro over_reverse_n_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload

-        pixld   , numbytes, firstreg, DST, 0

+        pixld   , \numbytes, \firstreg, DST, 0

 .endm

 .macro over_reverse_n_8888_1pixel  d, is_only

-        teq     WK&d, #0

+        teq     WK\()\d, #0

         beq     8f       /* replace with source */

-        bics    ORIG_W, STRIDE_D, WK&d, lsr #24

- .if is_only == 1

+        bics    ORIG_W, STRIDE_D, WK\()\d, lsr #24

+ .if \is_only == 1

         beq     49f      /* skip store */

  .else

         beq     9f       /* write same value back */

  .endif

         mla     SCRATCH, STRIDE_S, ORIG_W, MASK /* red/blue */

         mla     ORIG_W, STRIDE_M, ORIG_W, MASK  /* alpha/green */

         uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8

         uxtab16 ORIG_W, ORIG_W, ORIG_W, ror #8

         mov     SCRATCH, SCRATCH, ror #8

         sel     ORIG_W, SCRATCH, ORIG_W

-        uqadd8  WK&d, WK&d, ORIG_W

+        uqadd8  WK\()\d, WK\()\d, ORIG_W

         b       9f

-8:      mov     WK&d, SRC

+8:      mov     WK\()\d, SRC

9:

 .endm

 .macro over_reverse_n_8888_tail  numbytes, reg1, reg2, reg3, reg4

- .if numbytes == 4

-        over_reverse_n_8888_1pixel  reg1, 1

+ .if \numbytes == 4

+        over_reverse_n_8888_1pixel  \reg1, 1

  .else

-        and     SCRATCH, WK&reg1, WK&reg2

-  .if numbytes == 16

-        and     SCRATCH, SCRATCH, WK&reg3

-        and     SCRATCH, SCRATCH, WK&reg4

+        and     SCRATCH, WK\()\reg1, WK\()\reg2

+  .if \numbytes == 16

+        and     SCRATCH, SCRATCH, WK\()\reg3

+        and     SCRATCH, SCRATCH, WK\()\reg4

   .endif

         mvns    SCRATCH, SCRATCH, asr #24

         beq     49f /* skip store if all opaque */

-        over_reverse_n_8888_1pixel  reg1, 0

-        over_reverse_n_8888_1pixel  reg2, 0

-  .if numbytes == 16

-        over_reverse_n_8888_1pixel  reg3, 0

-        over_reverse_n_8888_1pixel  reg4, 0

+        over_reverse_n_8888_1pixel  \reg1, 0

+        over_reverse_n_8888_1pixel  \reg2, 0

+  .if \numbytes == 16

+        over_reverse_n_8888_1pixel  \reg3, 0

+        over_reverse_n_8888_1pixel  \reg4, 0

   .endif

  .endif

-        pixst   , numbytes, reg1, DST

+        pixst   , \numbytes, \reg1, DST

49:

 .endm

 .macro over_reverse_n_8888_process_tail  cond, numbytes, firstreg

-        over_reverse_n_8888_tail  numbytes, firstreg, %(firstreg+1), %(firstreg+2), %(firstreg+3)

+        over_reverse_n_8888_tail  \numbytes, \firstreg, %(\firstreg+1), %(\firstreg+2), %(\firstreg+3)

 .endm

 generate_composite_function \

     pixman_composite_over_reverse_n_8888_asm_armv6, 0, 0, 32 \

     FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH, \

     3, /* prefetch distance */ \

     over_reverse_n_8888_init, \

     over_reverse_n_8888_newline, \

@@ -789,30 +794,30 @@ generate_composite_function \

         .unreq  TMP1

         .unreq  TMP2

         .unreq  TMP3

         .unreq  WK4

 .endm

 .macro over_white_8888_8888_ca_combine  m, d

         uxtb16  TMP1, TMP0                /* rb_notmask */

-        uxtb16  TMP2, d                   /* rb_dest; 1 stall follows */

+        uxtb16  TMP2, \d                  /* rb_dest; 1 stall follows */

         smlatt  TMP3, TMP2, TMP1, HALF    /* red */

         smlabb  TMP2, TMP2, TMP1, HALF    /* blue */

         uxtb16  TMP0, TMP0, ror #8        /* ag_notmask */

-        uxtb16  TMP1, d, ror #8           /* ag_dest; 1 stall follows */

-        smlatt  d, TMP1, TMP0, HALF       /* alpha */

+        uxtb16  TMP1, \d, ror #8          /* ag_dest; 1 stall follows */

+        smlatt  \d, TMP1, TMP0, HALF      /* alpha */

         smlabb  TMP1, TMP1, TMP0, HALF    /* green */

         pkhbt   TMP0, TMP2, TMP3, lsl #16 /* rb; 1 stall follows */

-        pkhbt   TMP1, TMP1, d, lsl #16    /* ag */

+        pkhbt   TMP1, TMP1, \d, lsl #16   /* ag */

         uxtab16 TMP0, TMP0, TMP0, ror #8

         uxtab16 TMP1, TMP1, TMP1, ror #8

         mov     TMP0, TMP0, ror #8

-        sel     d, TMP0, TMP1

-        uqadd8  d, d, m                   /* d is a late result */

+        sel     \d, TMP0, TMP1

+        uqadd8  \d, \d, \m                 /* d is a late result */

 .endm

 .macro over_white_8888_8888_ca_1pixel_head

         pixld   , 4, 1, MASK, 0

         pixld   , 4, 3, DST, 0

 .endm

 .macro over_white_8888_8888_ca_1pixel_tail

@@ -848,29 +853,29 @@ 02:     mvn     TMP0, WK2

         movcs   WK4, WK2

         b       04f

 03:     over_white_8888_8888_ca_combine WK2, WK4

 04:     pixst   , 8, 3, DST

05:

 .endm

 .macro over_white_8888_8888_ca_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload

- .if numbytes == 4

+ .if \numbytes == 4

         over_white_8888_8888_ca_1pixel_head

  .else

-  .if numbytes == 16

+  .if \numbytes == 16

         over_white_8888_8888_ca_2pixels_head

         over_white_8888_8888_ca_2pixels_tail

   .endif

         over_white_8888_8888_ca_2pixels_head

  .endif

 .endm

 .macro over_white_8888_8888_ca_process_tail  cond, numbytes, firstreg

- .if numbytes == 4

+ .if \numbytes == 4

         over_white_8888_8888_ca_1pixel_tail

  .else

         over_white_8888_8888_ca_2pixels_tail

  .endif

 .endm

 generate_composite_function \

     pixman_composite_over_white_8888_8888_ca_asm_armv6, 0, 32, 32 \

@@ -999,33 +1004,33 @@ 20:     /* No simplifications possible -

         uqadd8  WK0, WK1, WK2            /* followed by 1 stall */

 30:     /* The destination buffer is already in the L1 cache, so

          * there's little point in amalgamating writes */

         pixst   , 4, 0, DST

40:

 .endm

 .macro over_n_8888_8888_ca_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload

- .rept (numbytes / 4) - 1

+ .rept (\numbytes / 4) - 1

         over_n_8888_8888_ca_1pixel_head

         over_n_8888_8888_ca_1pixel_tail

  .endr

         over_n_8888_8888_ca_1pixel_head

 .endm

 .macro over_n_8888_8888_ca_process_tail  cond, numbytes, firstreg

         over_n_8888_8888_ca_1pixel_tail

 .endm

 pixman_asm_function pixman_composite_over_n_8888_8888_ca_asm_armv6

         ldr     ip, [sp]

         cmp     ip, #-1

         beq     pixman_composite_over_white_8888_8888_ca_asm_armv6

         /* else drop through... */

- .endfunc

+ pixman_end_asm_function

 generate_composite_function \

     pixman_composite_over_n_8888_8888_ca_asm_armv6_helper, 0, 32, 32 \

     FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH | FLAG_PROCESS_CORRUPTS_WK0 \

     2, /* prefetch distance */ \

     over_n_8888_8888_ca_init, \

     nop_macro, /* newline */ \

     over_n_8888_8888_ca_cleanup, \

     over_n_8888_8888_ca_process_head, \

@@ -1040,94 +1045,94 @@ generate_composite_function \

         uadd8   SCRATCH, MASK, MASK

         /* Offset the source pointer: we only need the alpha bytes */

         add     SRC, SRC, #3

         line_saved_regs  ORIG_W

 .endm

 .macro in_reverse_8888_8888_head  numbytes, reg1, reg2, reg3

         ldrb    ORIG_W, [SRC], #4

- .if numbytes >= 8

-        ldrb    WK&reg1, [SRC], #4

-  .if numbytes == 16

-        ldrb    WK&reg2, [SRC], #4

-        ldrb    WK&reg3, [SRC], #4

+ .if \numbytes >= 8

+        ldrb    WK\()\reg1, [SRC], #4

+  .if \numbytes == 16

+        ldrb    WK\()\reg2, [SRC], #4

+        ldrb    WK\()\reg3, [SRC], #4

   .endif

  .endif

-        add     DST, DST, #numbytes

+        add     DST, DST, #\numbytes

 .endm

 .macro in_reverse_8888_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload

-        in_reverse_8888_8888_head  numbytes, firstreg, %(firstreg+1), %(firstreg+2)

+        in_reverse_8888_8888_head  \numbytes, \firstreg, %(\firstreg+1), %(\firstreg+2)

 .endm

 .macro in_reverse_8888_8888_1pixel  s, d, offset, is_only

- .if is_only != 1

-        movs    s, ORIG_W

-  .if offset != 0

-        ldrb    ORIG_W, [SRC, #offset]

+ .if \is_only != 1

+        movs    \s, ORIG_W

+  .if \offset != 0

+        ldrb    ORIG_W, [SRC, #\offset]

   .endif

         beq     01f

         teq     STRIDE_M, #0xFF

         beq     02f

  .endif

-        uxtb16  SCRATCH, d                 /* rb_dest */

-        uxtb16  d, d, ror #8               /* ag_dest */

-        mla     SCRATCH, SCRATCH, s, MASK

-        mla     d, d, s, MASK

+        uxtb16  SCRATCH, \d                 /* rb_dest */

+        uxtb16  \d, \d, ror #8               /* ag_dest */

+        mla     SCRATCH, SCRATCH, \s, MASK

+        mla     \d, \d, \s, MASK

         uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8

-        uxtab16 d, d, d, ror #8

+        uxtab16 \d, \d, \d, ror #8

         mov     SCRATCH, SCRATCH, ror #8

-        sel     d, SCRATCH, d

+        sel     \d, SCRATCH, \d

         b       02f

- .if offset == 0

+ .if \offset == 0

 48:     /* Last mov d,#0 of the set - used as part of shortcut for

          * source values all 0 */

  .endif

-01:     mov     d, #0

+01:     mov     \d, #0

02:

 .endm

 .macro in_reverse_8888_8888_tail  numbytes, reg1, reg2, reg3, reg4

- .if numbytes == 4

+ .if \numbytes == 4

         teq     ORIG_W, ORIG_W, asr #32

-        ldrne   WK&reg1, [DST, #-4]

- .elseif numbytes == 8

-        teq     ORIG_W, WK&reg1

+        ldrne   WK\()\reg1, [DST, #-4]

+ .elseif \numbytes == 8

+        teq     ORIG_W, WK\()\reg1

         teqeq   ORIG_W, ORIG_W, asr #32  /* all 0 or all -1? */

-        ldmnedb DST, {WK&reg1-WK&reg2}

+        ldmnedb DST, {WK\()\reg1-WK\()\reg2}

  .else

-        teq     ORIG_W, WK&reg1

-        teqeq   ORIG_W, WK&reg2

-        teqeq   ORIG_W, WK&reg3

+        teq     ORIG_W, WK\()\reg1

+        teqeq   ORIG_W, WK\()\reg2

+        teqeq   ORIG_W, WK\()\reg3

         teqeq   ORIG_W, ORIG_W, asr #32  /* all 0 or all -1? */

-        ldmnedb DST, {WK&reg1-WK&reg4}

+        ldmnedb DST, {WK\()\reg1-WK\()\reg4}

  .endif

         cmnne   DST, #0   /* clear C if NE */

         bcs     49f       /* no writes to dest if source all -1 */

         beq     48f       /* set dest to all 0 if source all 0 */

- .if numbytes == 4

-        in_reverse_8888_8888_1pixel  ORIG_W, WK&reg1, 0, 1

-        str     WK&reg1, [DST, #-4]

- .elseif numbytes == 8

-        in_reverse_8888_8888_1pixel  STRIDE_M, WK&reg1, -4, 0

-        in_reverse_8888_8888_1pixel  STRIDE_M, WK&reg2, 0, 0

-        stmdb   DST, {WK&reg1-WK&reg2}

+ .if \numbytes == 4

+        in_reverse_8888_8888_1pixel  ORIG_W, WK\()\reg1, 0, 1

+        str     WK\()\reg1, [DST, #-4]

+ .elseif \numbytes == 8

+        in_reverse_8888_8888_1pixel  STRIDE_M, WK\()\reg1, -4, 0

+        in_reverse_8888_8888_1pixel  STRIDE_M, WK\()\reg2, 0, 0

+        stmdb   DST, {WK\()\reg1-WK\()\reg2}

  .else

-        in_reverse_8888_8888_1pixel  STRIDE_M, WK&reg1, -12, 0

-        in_reverse_8888_8888_1pixel  STRIDE_M, WK&reg2, -8, 0

-        in_reverse_8888_8888_1pixel  STRIDE_M, WK&reg3, -4, 0

-        in_reverse_8888_8888_1pixel  STRIDE_M, WK&reg4, 0, 0

-        stmdb   DST, {WK&reg1-WK&reg4}

+        in_reverse_8888_8888_1pixel  STRIDE_M, WK\()\reg1, -12, 0

+        in_reverse_8888_8888_1pixel  STRIDE_M, WK\()\reg2, -8, 0

+        in_reverse_8888_8888_1pixel  STRIDE_M, WK\()\reg3, -4, 0

+        in_reverse_8888_8888_1pixel  STRIDE_M, WK\()\reg4, 0, 0

+        stmdb   DST, {WK\()\reg1-WK\()\reg4}

  .endif

49:

 .endm

 .macro in_reverse_8888_8888_process_tail  cond, numbytes, firstreg

-        in_reverse_8888_8888_tail  numbytes, firstreg, %(firstreg+1), %(firstreg+2), %(firstreg+3)

+        in_reverse_8888_8888_tail  \numbytes, \firstreg, %(\firstreg+1), %(\firstreg+2), %(\firstreg+3)

 .endm

 generate_composite_function \

     pixman_composite_in_reverse_8888_8888_asm_armv6, 32, 0, 32 \

     FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH | FLAG_NO_PRELOAD_DST \

     2, /* prefetch distance */ \

     in_reverse_8888_8888_init, \

     nop_macro, /* newline */ \

@@ -1144,31 +1149,31 @@ generate_composite_function \

         /* Hold multiplier for destination in STRIDE_M */

         mov     STRIDE_M, #255

         sub     STRIDE_M, STRIDE_M, SRC, lsr #24

         /* Set GE[3:0] to 0101 so SEL instructions do what we want */

         uadd8   SCRATCH, MASK, MASK

 .endm

 .macro over_n_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload

-        pixld   , numbytes, firstreg, DST, 0

+        pixld   , \numbytes, \firstreg, DST, 0

 .endm

 .macro over_n_8888_1pixel dst

-        mul_8888_8  WK&dst, STRIDE_M, SCRATCH, MASK

-        uqadd8  WK&dst, WK&dst, SRC

+        mul_8888_8  WK\()\dst, STRIDE_M, SCRATCH, MASK

+        uqadd8  WK\()\dst, WK\()\dst, SRC

 .endm

 .macro over_n_8888_process_tail  cond, numbytes, firstreg

- .set PROCESS_REG, firstreg

- .rept numbytes / 4

+ .set PROCESS_REG, \firstreg

+ .rept \numbytes / 4

         over_n_8888_1pixel %(PROCESS_REG)

   .set PROCESS_REG, PROCESS_REG+1

  .endr

-        pixst   , numbytes, firstreg, DST

+        pixst   , \numbytes, \firstreg, DST

 .endm

 generate_composite_function \

     pixman_composite_over_n_8888_asm_armv6, 0, 0, 32 \

     FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_DOES_STORE \

     2, /* prefetch distance */ \

     over_n_8888_init, \

     nop_macro, /* newline */ \

diff --git a/gfx/cairo/libpixman/src/pixman-arm-simd-asm.h b/gfx/cairo/libpixman/src/pixman-arm-simd-asm.h

--- a/gfx/cairo/libpixman/src/pixman-arm-simd-asm.h

+++ b/gfx/cairo/libpixman/src/pixman-arm-simd-asm.h

@@ -107,88 +107,120 @@

 .set PREFETCH_TYPE_NONE,       0

 .set PREFETCH_TYPE_STANDARD,   1

/*

  * Definitions of macros for load/store of pixel data.

*/

 .macro pixldst op, cond=al, numbytes, reg0, reg1, reg2, reg3, base, unaligned=0

- .if numbytes == 16

-  .if unaligned == 1

-        op&r&cond    WK&reg0, [base], #4

-        op&r&cond    WK&reg1, [base], #4

-        op&r&cond    WK&reg2, [base], #4

-        op&r&cond    WK&reg3, [base], #4

+ .if \numbytes == 16

+  .if \unaligned == 1

+        \op\()r\()\cond    WK\()\reg0, [\base], #4

+        \op\()r\()\cond    WK\()\reg1, [\base], #4

+        \op\()r\()\cond    WK\()\reg2, [\base], #4

+        \op\()r\()\cond    WK\()\reg3, [\base], #4

   .else

-        op&m&cond&ia base!, {WK&reg0,WK&reg1,WK&reg2,WK&reg3}

+#ifdef __clang__

+        \op\()mia\()\cond  \base!, {WK\()\reg0,WK\()\reg1,WK\()\reg2,WK\()\reg3}

+#else

+        \op\()m\()\cond\()ia  \base!, {WK\()\reg0,WK\()\reg1,WK\()\reg2,WK\()\reg3}

+#endif

   .endif

- .elseif numbytes == 8

-  .if unaligned == 1

-        op&r&cond    WK&reg0, [base], #4

-        op&r&cond    WK&reg1, [base], #4

+ .elseif \numbytes == 8

+  .if \unaligned == 1

+        \op\()r\()\cond    WK\()\reg0, [\base], #4

+        \op\()r\()\cond    WK\()\reg1, [\base], #4

   .else

-        op&m&cond&ia base!, {WK&reg0,WK&reg1}

+#ifdef __clang__

+        \op\()mia\()\cond  \base!, {WK\()\reg0,WK\()\reg1}

+#else

+        \op\()m\()\cond\()ia  \base!, {WK\()\reg0,WK\()\reg1}

+#endif

   .endif

- .elseif numbytes == 4

-        op&r&cond    WK&reg0, [base], #4

- .elseif numbytes == 2

-        op&r&cond&h  WK&reg0, [base], #2

- .elseif numbytes == 1

-        op&r&cond&b  WK&reg0, [base], #1

+ .elseif \numbytes == 4

+        \op\()r\()\cond    WK\()\reg0, [\base], #4

+ .elseif \numbytes == 2

+#ifdef __clang__

+        \op\()rh\()\cond   WK\()\reg0, [\base], #2

+#else

+        \op\()r\()\cond\()h   WK\()\reg0, [\base], #2

+#endif

+ .elseif \numbytes == 1

+#ifdef __clang__

+        \op\()rb\()\cond   WK\()\reg0, [\base], #1

+#else

+        \op\()r\()\cond\()b   WK\()\reg0, [\base], #1

+#endif

  .else

-  .error "unsupported size: numbytes"

+  .error "unsupported size: \numbytes"

  .endif

 .endm

 .macro pixst_baseupdated cond, numbytes, reg0, reg1, reg2, reg3, base

- .if numbytes == 16

-        stm&cond&db base, {WK&reg0,WK&reg1,WK&reg2,WK&reg3}

- .elseif numbytes == 8

-        stm&cond&db base, {WK&reg0,WK&reg1}

- .elseif numbytes == 4

-        str&cond    WK&reg0, [base, #-4]

- .elseif numbytes == 2

-        str&cond&h  WK&reg0, [base, #-2]

- .elseif numbytes == 1

-        str&cond&b  WK&reg0, [base, #-1]

+ .if \numbytes == 16

+#ifdef __clang__

+        stmdb\()\cond \base, {WK\()\reg0,WK\()\reg1,WK\()\reg2,WK\()\reg3}

+#else

+        stm\()\cond\()db \base, {WK\()\reg0,WK\()\reg1,WK\()\reg2,WK\()\reg3}

+#endif

+ .elseif \numbytes == 8

+#ifdef __clang__

+        stmdb\()\cond \base, {WK\()\reg0,WK\()\reg1}

+#else

+        stm\()\cond\()db \base, {WK\()\reg0,WK\()\reg1}

+#endif

+ .elseif \numbytes == 4

+        str\()\cond    WK\()\reg0, [\base, #-4]

+ .elseif \numbytes == 2

+#ifdef __clang__

+        strh\()\cond   WK\()\reg0, [\base, #-2]

+#else

+        str\()\cond\()h   WK\()\reg0, [\base, #-2]

+#endif

+ .elseif \numbytes == 1

+#ifdef __clang__

+        strb\()\cond   WK\()\reg0, [\base, #-1]

+#else

+        str\()\cond\()b   WK\()\reg0, [\base, #-1]

+#endif

  .else

-  .error "unsupported size: numbytes"

+  .error "unsupported size: \numbytes"

  .endif

 .endm

 .macro pixld cond, numbytes, firstreg, base, unaligned

-        pixldst ld, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base, unaligned

+        pixldst ld, \cond, \numbytes, %(\firstreg+0), %(\firstreg+1), %(\firstreg+2), %(\firstreg+3), \base, \unaligned

 .endm

 .macro pixst cond, numbytes, firstreg, base

  .if (flags) & FLAG_DST_READWRITE

-        pixst_baseupdated cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base

+        pixst_baseupdated \cond, \numbytes, %(\firstreg+0), %(\firstreg+1), %(\firstreg+2), %(\firstreg+3), \base

  .else

-        pixldst st, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base

+        pixldst st, \cond, \numbytes, %(\firstreg+0), %(\firstreg+1), %(\firstreg+2), %(\firstreg+3), \base

  .endif

 .endm

 .macro PF a, x:vararg

  .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_STANDARD)

-        a x

+        \a \x

  .endif

 .endm

 .macro preload_leading_step1  bpp, ptr, base

 /* If the destination is already 16-byte aligned, then we need to preload

  * between 0 and prefetch_distance (inclusive) cache lines ahead so there

  * are no gaps when the inner loop starts.

*/

- .if bpp > 0

-        PF  bic,    ptr, base, #31

+ .if \bpp > 0

+        PF  bic,    \ptr, \base, #31

   .set OFFSET, 0

   .rept prefetch_distance+1

-        PF  pld,    [ptr, #OFFSET]

+        PF  pld,    [\ptr, #OFFSET]

    .set OFFSET, OFFSET+32

   .endr

  .endif

 .endm

 .macro preload_leading_step2  bpp, bpp_shift, ptr, base

 /* However, if the destination is not 16-byte aligned, we may need to

  * preload more cache lines than that. The question we need to ask is:

@@ -196,81 +228,81 @@

  * by which the source pointer will be rounded down for preloading, and if

  * so, by how many cache lines? Effectively, we want to calculate

  *     leading_bytes = ((-dst)&15)*src_bpp/dst_bpp

  *     inner_loop_offset = (src+leading_bytes)&31

  *     extra_needed = leading_bytes - inner_loop_offset

  * and test if extra_needed is <= 0, <= 32, or > 32 (where > 32 is only

  * possible when there are 4 src bytes for every 1 dst byte).

*/

- .if bpp > 0

-  .ifc base,DST

+ .if \bpp > 0

+  .ifc \base,DST

         /* The test can be simplified further when preloading the destination */

-        PF  tst,    base, #16

+        PF  tst,    \base, #16

         PF  beq,    61f

   .else

-   .if bpp/dst_w_bpp == 4

-        PF  add,    SCRATCH, base, WK0, lsl #bpp_shift-dst_bpp_shift

+   .if \bpp/dst_w_bpp == 4

+        PF  add,    SCRATCH, \base, WK0, lsl #\bpp_shift-dst_bpp_shift

         PF  and,    SCRATCH, SCRATCH, #31

-        PF  rsb,    SCRATCH, SCRATCH, WK0, lsl #bpp_shift-dst_bpp_shift

+        PF  rsb,    SCRATCH, SCRATCH, WK0, lsl #\bpp_shift-dst_bpp_shift

         PF  sub,    SCRATCH, SCRATCH, #1        /* so now ranges are -16..-1 / 0..31 / 32..63 */

         PF  movs,   SCRATCH, SCRATCH, lsl #32-6 /* so this sets         NC   /  nc   /   Nc   */

         PF  bcs,    61f

         PF  bpl,    60f

-        PF  pld,    [ptr, #32*(prefetch_distance+2)]

+        PF  pld,    [\ptr, #32*(prefetch_distance+2)]

    .else

-        PF  mov,    SCRATCH, base, lsl #32-5

-        PF  add,    SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift

-        PF  rsbs,   SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift

+        PF  mov,    SCRATCH, \base, lsl #32-5

+        PF  add,    SCRATCH, SCRATCH, WK0, lsl #32-5+\bpp_shift-dst_bpp_shift

+        PF  rsbs,   SCRATCH, SCRATCH, WK0, lsl #32-5+\bpp_shift-dst_bpp_shift

         PF  bls,    61f

    .endif

   .endif

-60:     PF  pld,    [ptr, #32*(prefetch_distance+1)]

+60:     PF  pld,    [\ptr, #32*(prefetch_distance+1)]

61:

  .endif

 .endm

 #define IS_END_OF_GROUP(INDEX,SIZE) ((SIZE) < 2 || ((INDEX) & ~((INDEX)+1)) & ((SIZE)/2))

 .macro preload_middle   bpp, base, scratch_holds_offset

- .if bpp > 0

+ .if \bpp > 0

         /* prefetch distance = 256/bpp, stm distance = 128/dst_w_bpp */

-  .if IS_END_OF_GROUP(SUBBLOCK,256/128*dst_w_bpp/bpp)

-   .if scratch_holds_offset

-        PF  pld,    [base, SCRATCH]

+  .if IS_END_OF_GROUP(SUBBLOCK,256/128*dst_w_bpp/\bpp)

+   .if \scratch_holds_offset

+        PF  pld,    [\base, SCRATCH]

    .else

-        PF  bic,    SCRATCH, base, #31

+        PF  bic,    SCRATCH, \base, #31

         PF  pld,    [SCRATCH, #32*prefetch_distance]

    .endif

   .endif

  .endif

 .endm

 .macro preload_trailing  bpp, bpp_shift, base

- .if bpp > 0

-  .if bpp*pix_per_block > 256

+ .if \bpp > 0

+  .if \bpp*pix_per_block > 256

         /* Calculations are more complex if more than one fetch per block */

-        PF  and,    WK1, base, #31

-        PF  add,    WK1, WK1, WK0, lsl #bpp_shift

-        PF  add,    WK1, WK1, #32*(bpp*pix_per_block/256-1)*(prefetch_distance+1)

-        PF  bic,    SCRATCH, base, #31

+        PF  and,    WK1, \base, #31

+        PF  add,    WK1, WK1, WK0, lsl #\bpp_shift

+        PF  add,    WK1, WK1, #32*(\bpp*pix_per_block/256-1)*(prefetch_distance+1)

+        PF  bic,    SCRATCH, \base, #31

 80:     PF  pld,    [SCRATCH, #32*(prefetch_distance+1)]

         PF  add,    SCRATCH, SCRATCH, #32

         PF  subs,   WK1, WK1, #32

         PF  bhi,    80b

   .else

         /* If exactly one fetch per block, then we need either 0, 1 or 2 extra preloads */

-        PF  mov,    SCRATCH, base, lsl #32-5

-        PF  adds,   SCRATCH, SCRATCH, X, lsl #32-5+bpp_shift

+        PF  mov,    SCRATCH, \base, lsl #32-5

+        PF  adds,   SCRATCH, SCRATCH, X, lsl #32-5+\bpp_shift

         PF  adceqs, SCRATCH, SCRATCH, #0

         /* The instruction above has two effects: ensures Z is only

          * set if C was clear (so Z indicates that both shifted quantities

          * were 0), and clears C if Z was set (so C indicates that the sum

          * of the shifted quantities was greater and not equal to 32) */

         PF  beq,    82f

-        PF  bic,    SCRATCH, base, #31

+        PF  bic,    SCRATCH, \base, #31

         PF  bcc,    81f

         PF  pld,    [SCRATCH, #32*(prefetch_distance+2)]

 81:     PF  pld,    [SCRATCH, #32*(prefetch_distance+1)]

82:

   .endif

  .endif

 .endm

@@ -283,97 +315,97 @@ 82:

  *    pixels) they cannot possibly straddle more than 2 32-byte cachelines,

  *    meaning there's no need for a loop.

  * "bpp" - number of bits per pixel in the channel (source, mask or

  *    destination) that's being preloaded, or 0 if this channel is not used

  *    for reading

  * "bpp_shift" - log2 of ("bpp"/8) (except if "bpp"=0 of course)

  * "base" - base address register of channel to preload (SRC, MASK or DST)

*/

- .if bpp > 0

-  .if narrow_case && (bpp <= dst_w_bpp)

+ .if \bpp > 0

+  .if \narrow_case && (\bpp <= dst_w_bpp)

         /* In these cases, each line for each channel is in either 1 or 2 cache lines */

-        PF  bic,    WK0, base, #31

+        PF  bic,    WK0, \base, #31

         PF  pld,    [WK0]

-        PF  add,    WK1, base, X, LSL #bpp_shift

+        PF  add,    WK1, \base, X, LSL #\bpp_shift

         PF  sub,    WK1, WK1, #1

         PF  bic,    WK1, WK1, #31

         PF  cmp,    WK1, WK0

         PF  beq,    90f

         PF  pld,    [WK1]

90:

   .else

-        PF  bic,    WK0, base, #31

+        PF  bic,    WK0, \base, #31

         PF  pld,    [WK0]

-        PF  add,    WK1, base, X, lsl #bpp_shift

+        PF  add,    WK1, \base, X, lsl #\bpp_shift

         PF  sub,    WK1, WK1, #1

         PF  bic,    WK1, WK1, #31

         PF  cmp,    WK1, WK0

         PF  beq,    92f

 91:     PF  add,    WK0, WK0, #32

         PF  cmp,    WK0, WK1

         PF  pld,    [WK0]

         PF  bne,    91b

92:

   .endif

  .endif

 .endm

 .macro conditional_process1_helper  cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx

-        process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, 0

- .if decrementx

-        sub&cond X, X, #8*numbytes/dst_w_bpp

+        \process_head  \cond, \numbytes, \firstreg, \unaligned_src, \unaligned_mask, 0

+ .if \decrementx

+        sub\()\cond X, X, #8*\numbytes/dst_w_bpp

  .endif

-        process_tail  cond, numbytes, firstreg

+        \process_tail  \cond, \numbytes, \firstreg

  .if !((flags) & FLAG_PROCESS_DOES_STORE)

-        pixst   cond, numbytes, firstreg, DST

+        pixst   \cond, \numbytes, \firstreg, DST

  .endif

 .endm

 .macro conditional_process1  cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx

  .if (flags) & FLAG_BRANCH_OVER

-  .ifc cond,mi

+  .ifc \cond,mi

         bpl     100f

   .endif

-  .ifc cond,cs

+  .ifc \cond,cs

         bcc     100f

   .endif

-  .ifc cond,ne

+  .ifc \cond,ne

         beq     100f

   .endif

-        conditional_process1_helper  , process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx

+        conditional_process1_helper  , \process_head, \process_tail, \numbytes, \firstreg, \unaligned_src, \unaligned_mask, \decrementx

 100:

  .else

-        conditional_process1_helper  cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx

+        conditional_process1_helper  \cond, \process_head, \process_tail, \numbytes, \firstreg, \unaligned_src, \unaligned_mask, \decrementx

  .endif

 .endm

 .macro conditional_process2  test, cond1, cond2, process_head, process_tail, numbytes1, numbytes2, firstreg1, firstreg2, unaligned_src, unaligned_mask, decrementx

  .if (flags) & (FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE)

         /* Can't interleave reads and writes */

-        test

-        conditional_process1  cond1, process_head, process_tail, numbytes1, firstreg1, unaligned_src, unaligned_mask, decrementx

+        \test

+        conditional_process1  \cond1, \process_head, \process_tail, \numbytes1, \firstreg1, \unaligned_src, \unaligned_mask, \decrementx

   .if (flags) & FLAG_PROCESS_CORRUPTS_PSR

-        test

+        \test

   .endif

-        conditional_process1  cond2, process_head, process_tail, numbytes2, firstreg2, unaligned_src, unaligned_mask, decrementx

+        conditional_process1  \cond2, \process_head, \process_tail, \numbytes2, \firstreg2, \unaligned_src, \unaligned_mask, \decrementx

  .else

         /* Can interleave reads and writes for better scheduling */

-        test

-        process_head  cond1, numbytes1, firstreg1, unaligned_src, unaligned_mask, 0

-        process_head  cond2, numbytes2, firstreg2, unaligned_src, unaligned_mask, 0

-  .if decrementx

-        sub&cond1 X, X, #8*numbytes1/dst_w_bpp

-        sub&cond2 X, X, #8*numbytes2/dst_w_bpp

+        \test

+        \process_head  \cond1, \numbytes1, \firstreg1, \unaligned_src, \unaligned_mask, 0

+        \process_head  \cond2, \numbytes2, \firstreg2, \unaligned_src, \unaligned_mask, 0

+  .if \decrementx

+        sub\()\cond1 X, X, #8*\numbytes1/dst_w_bpp

+        sub\()\cond2 X, X, #8*\numbytes2/dst_w_bpp

   .endif

-        process_tail  cond1, numbytes1, firstreg1

-        process_tail  cond2, numbytes2, firstreg2

-        pixst   cond1, numbytes1, firstreg1, DST

-        pixst   cond2, numbytes2, firstreg2, DST

+        \process_tail  \cond1, \numbytes1, \firstreg1

+        \process_tail  \cond2, \numbytes2, \firstreg2

+        pixst   \cond1, \numbytes1, \firstreg1, DST

+        pixst   \cond2, \numbytes2, \firstreg2, DST

  .endif

 .endm

 .macro test_bits_1_0_ptr

  .if (flags) & FLAG_PROCESS_CORRUPTS_WK0

         movs    SCRATCH, X, lsl #32-1  /* C,N = bits 1,0 of DST */

  .else

@@ -395,22 +427,22 @@ 100:

  .if (flags) & FLAG_PROCESS_CORRUPTS_WK0

   .set DECREMENT_X, 0

         sub     X, X, WK0, lsr #dst_bpp_shift

         str     X, [sp, #LINE_SAVED_REG_COUNT*4]

         mov     X, WK0

  .endif

         /* Use unaligned loads in all cases for simplicity */

  .if dst_w_bpp == 8

-        conditional_process2  test_bits_1_0_ptr, mi, cs, process_head, process_tail, 1, 2, 1, 2, 1, 1, DECREMENT_X

+        conditional_process2  test_bits_1_0_ptr, mi, cs, \process_head, \process_tail, 1, 2, 1, 2, 1, 1, DECREMENT_X

  .elseif dst_w_bpp == 16

         test_bits_1_0_ptr

-        conditional_process1  cs, process_head, process_tail, 2, 2, 1, 1, DECREMENT_X

+        conditional_process1  cs, \process_head, \process_tail, 2, 2, 1, 1, DECREMENT_X

  .endif

-        conditional_process2  test_bits_3_2_ptr, mi, cs, process_head, process_tail, 4, 8, 1, 2, 1, 1, DECREMENT_X

+        conditional_process2  test_bits_3_2_ptr, mi, cs, \process_head, \process_tail, 4, 8, 1, 2, 1, 1, DECREMENT_X

  .if (flags) & FLAG_PROCESS_CORRUPTS_WK0

         ldr     X, [sp, #LINE_SAVED_REG_COUNT*4]

  .endif

 .endm

 .macro test_bits_3_2_pix

         movs    SCRATCH, X, lsl #dst_bpp_shift+32-3

 .endm

@@ -419,169 +451,169 @@ 100:

  .if dst_w_bpp == 8

         movs    SCRATCH, X, lsl #dst_bpp_shift+32-1

  .else

         movs    SCRATCH, X, lsr #1

  .endif

 .endm

 .macro trailing_15bytes  process_head, process_tail, unaligned_src, unaligned_mask

-        conditional_process2  test_bits_3_2_pix, cs, mi, process_head, process_tail, 8, 4, 0, 2, unaligned_src, unaligned_mask, 0

+        conditional_process2  test_bits_3_2_pix, cs, mi, \process_head, \process_tail, 8, 4, 0, 2, \unaligned_src, \unaligned_mask, 0

  .if dst_w_bpp == 16

         test_bits_1_0_pix

-        conditional_process1  cs, process_head, process_tail, 2, 0, unaligned_src, unaligned_mask, 0

+        conditional_process1  cs, \process_head, \process_tail, 2, 0, \unaligned_src, \unaligned_mask, 0

  .elseif dst_w_bpp == 8

-        conditional_process2  test_bits_1_0_pix, cs, mi, process_head, process_tail, 2, 1, 0, 1, unaligned_src, unaligned_mask, 0

+        conditional_process2  test_bits_1_0_pix, cs, mi, \process_head, \process_tail, 2, 1, 0, 1, \unaligned_src, \unaligned_mask, 0

  .endif

 .endm

 .macro wide_case_inner_loop  process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment

 110:

  .set SUBBLOCK, 0 /* this is a count of STMs; there can be up to 8 STMs per block */

  .rept pix_per_block*dst_w_bpp/128

-        process_head  , 16, 0, unaligned_src, unaligned_mask, 1

+        \process_head  , 16, 0, \unaligned_src, \unaligned_mask, 1

   .if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)

         preload_middle  src_bpp, SRC, 1

   .elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)

         preload_middle  mask_bpp, MASK, 1

   .else

         preload_middle  src_bpp, SRC, 0

         preload_middle  mask_bpp, MASK, 0

   .endif

   .if (dst_r_bpp > 0) && ((SUBBLOCK % 2) == 0) && (((flags) & FLAG_NO_PRELOAD_DST) == 0)

         /* Because we know that writes are 16-byte aligned, it's relatively easy to ensure that

          * destination prefetches are 32-byte aligned. It's also the easiest channel to offset

          * preloads for, to achieve staggered prefetches for multiple channels, because there are

          * always two STMs per prefetch, so there is always an opposite STM on which to put the

          * preload. Note, no need to BIC the base register here */

-        PF  pld,    [DST, #32*prefetch_distance - dst_alignment]

+        PF  pld,    [DST, #32*prefetch_distance - \dst_alignment]

   .endif

-        process_tail  , 16, 0

+        \process_tail  , 16, 0

   .if !((flags) & FLAG_PROCESS_DOES_STORE)

         pixst   , 16, 0, DST

   .endif

   .set SUBBLOCK, SUBBLOCK+1

  .endr

         subs    X, X, #pix_per_block

         bhs     110b

 .endm

 .macro wide_case_inner_loop_and_trailing_pixels  process_head, process_tail, process_inner_loop, exit_label, unaligned_src, unaligned_mask

         /* Destination now 16-byte aligned; we have at least one block before we have to stop preloading */

  .if dst_r_bpp > 0

         tst     DST, #16

         bne     111f

-        process_inner_loop  process_head, process_tail, unaligned_src, unaligned_mask, 16 + DST_PRELOAD_BIAS

+        \process_inner_loop  \process_head, \process_tail, \unaligned_src, \unaligned_mask, 16 + DST_PRELOAD_BIAS

         b       112f

 111:

  .endif

-        process_inner_loop  process_head, process_tail, unaligned_src, unaligned_mask, 0 + DST_PRELOAD_BIAS

+        \process_inner_loop  \process_head, \process_tail, \unaligned_src, \unaligned_mask, 0 + DST_PRELOAD_BIAS

 112:

         /* Just before the final (prefetch_distance+1) 32-byte blocks, deal with final preloads */

  .if (src_bpp*pix_per_block > 256) || (mask_bpp*pix_per_block > 256) || (dst_r_bpp*pix_per_block > 256)

         PF  and,    WK0, X, #pix_per_block-1

  .endif

         preload_trailing  src_bpp, src_bpp_shift, SRC

         preload_trailing  mask_bpp, mask_bpp_shift, MASK

  .if ((flags) & FLAG_NO_PRELOAD_DST) == 0

         preload_trailing  dst_r_bpp, dst_bpp_shift, DST

  .endif

         add     X, X, #(prefetch_distance+2)*pix_per_block - 128/dst_w_bpp

         /* The remainder of the line is handled identically to the medium case */

-        medium_case_inner_loop_and_trailing_pixels  process_head, process_tail,, exit_label, unaligned_src, unaligned_mask

+        medium_case_inner_loop_and_trailing_pixels  \process_head, \process_tail,, \exit_label, \unaligned_src, \unaligned_mask

 .endm

 .macro medium_case_inner_loop_and_trailing_pixels  process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask

 120:

-        process_head  , 16, 0, unaligned_src, unaligned_mask, 0

-        process_tail  , 16, 0

+        \process_head  , 16, 0, \unaligned_src, \unaligned_mask, 0

+        \process_tail  , 16, 0

  .if !((flags) & FLAG_PROCESS_DOES_STORE)

         pixst   , 16, 0, DST

  .endif

         subs    X, X, #128/dst_w_bpp

         bhs     120b

         /* Trailing pixels */

         tst     X, #128/dst_w_bpp - 1

-        beq     exit_label

-        trailing_15bytes  process_head, process_tail, unaligned_src, unaligned_mask

+        beq     \exit_label

+        trailing_15bytes  \process_head, \process_tail, \unaligned_src, \unaligned_mask

 .endm

 .macro narrow_case_inner_loop_and_trailing_pixels  process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask

         tst     X, #16*8/dst_w_bpp

-        conditional_process1  ne, process_head, process_tail, 16, 0, unaligned_src, unaligned_mask, 0

+        conditional_process1  ne, \process_head, \process_tail, 16, 0, \unaligned_src, \unaligned_mask, 0

         /* Trailing pixels */

         /* In narrow case, it's relatively unlikely to be aligned, so let's do without a branch here */

-        trailing_15bytes  process_head, process_tail, unaligned_src, unaligned_mask

+        trailing_15bytes  \process_head, \process_tail, \unaligned_src, \unaligned_mask

 .endm

 .macro switch_on_alignment  action, process_head, process_tail, process_inner_loop, exit_label

  /* Note that if we're reading the destination, it's already guaranteed to be aligned at this point */

  .if mask_bpp == 8 || mask_bpp == 16

         tst     MASK, #3

         bne     141f

  .endif

   .if src_bpp == 8 || src_bpp == 16

         tst     SRC, #3

         bne     140f

   .endif

-        action  process_head, process_tail, process_inner_loop, exit_label, 0, 0

+        \action  \process_head, \process_tail, \process_inner_loop, \exit_label, 0, 0

   .if src_bpp == 8 || src_bpp == 16

-        b       exit_label

+        b       \exit_label

 140:

-        action  process_head, process_tail, process_inner_loop, exit_label, 1, 0

+        \action  \process_head, \process_tail, \process_inner_loop, \exit_label, 1, 0

   .endif

  .if mask_bpp == 8 || mask_bpp == 16

-        b       exit_label

+        b       \exit_label

 141:

   .if src_bpp == 8 || src_bpp == 16

         tst     SRC, #3

         bne     142f

   .endif

-        action  process_head, process_tail, process_inner_loop, exit_label, 0, 1

+        \action  \process_head, \process_tail, \process_inner_loop, \exit_label, 0, 1

   .if src_bpp == 8 || src_bpp == 16

-        b       exit_label

+        b       \exit_label

 142:

-        action  process_head, process_tail, process_inner_loop, exit_label, 1, 1

+        \action  \process_head, \process_tail, \process_inner_loop, \exit_label, 1, 1

   .endif

  .endif

 .endm

 .macro end_of_line      restore_x, vars_spilled, loop_label, last_one

- .if vars_spilled

+ .if \vars_spilled

         /* Sadly, GAS doesn't seem have an equivalent of the DCI directive? */

         /* This is ldmia sp,{} */

         .word   0xE89D0000 | LINE_SAVED_REGS

  .endif

         subs    Y, Y, #1

- .if vars_spilled

+ .if \vars_spilled

   .if (LINE_SAVED_REGS) & (1<<1)

         str     Y, [sp]

   .endif

  .endif

         add     DST, DST, STRIDE_D

  .if src_bpp > 0

         add     SRC, SRC, STRIDE_S

  .endif

  .if mask_bpp > 0

         add     MASK, MASK, STRIDE_M

  .endif

- .if restore_x

+ .if \restore_x

         mov     X, ORIG_W

  .endif

-        bhs     loop_label

- .ifc "last_one",""

-  .if vars_spilled

+        bhs     \loop_label

+ .ifc "\last_one",""

+  .if \vars_spilled

         b       197f

   .else

         b       198f

   .endif

  .else

-  .if (!vars_spilled) && ((flags) & FLAG_SPILL_LINE_VARS)

+  .if (!\vars_spilled) && ((flags) & FLAG_SPILL_LINE_VARS)

         b       198f

   .endif

  .endif

 .endm

 .macro generate_composite_function fname, \

                                    src_bpp_, \

@@ -591,27 +623,27 @@ 142:

                                    prefetch_distance_, \

                                    init, \

                                    newline, \

                                    cleanup, \

                                    process_head, \

                                    process_tail, \

                                    process_inner_loop

-    pixman_asm_function fname

+    pixman_asm_function \fname

/*

  * Make some macro arguments globally visible and accessible

  * from other macros

*/

- .set src_bpp, src_bpp_

- .set mask_bpp, mask_bpp_

- .set dst_w_bpp, dst_w_bpp_

- .set flags, flags_

- .set prefetch_distance, prefetch_distance_

+ .set src_bpp, \src_bpp_

+ .set mask_bpp, \mask_bpp_

+ .set dst_w_bpp, \dst_w_bpp_

+ .set flags, \flags_

+ .set prefetch_distance, \prefetch_distance_

/*

  * Select prefetch type for this function.

*/

  .if prefetch_distance == 0

   .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE

  .else

   .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_STANDARD

@@ -727,17 +759,17 @@ 142:

  .endif

 #ifdef DEBUG_PARAMS

         add     Y, Y, #1

         stmia   sp, {r0-r7,pc}

         sub     Y, Y, #1

 #endif

-        init

+        \init

  .if (flags) & FLAG_PROCESS_CORRUPTS_WK0

         /* Reserve a word in which to store X during leading pixels */

         sub     sp, sp, #4

   .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET+4

   .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET+4

  .endif

@@ -768,47 +800,47 @@ 142:

         mov     ORIG_W, X

   .if (flags) & FLAG_SPILL_LINE_VARS_WIDE

         /* This is stmdb sp!,{} */

         .word   0xE92D0000 | LINE_SAVED_REGS

    .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4

    .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4

   .endif

 151:    /* New line */

-        newline

+        \newline

         preload_leading_step1  src_bpp, WK1, SRC

         preload_leading_step1  mask_bpp, WK2, MASK

   .if ((flags) & FLAG_NO_PRELOAD_DST) == 0

         preload_leading_step1  dst_r_bpp, WK3, DST

   .endif

         ands    WK0, DST, #15

         beq     154f

         rsb     WK0, WK0, #16 /* number of leading bytes until destination aligned */

         preload_leading_step2  src_bpp, src_bpp_shift, WK1, SRC

         preload_leading_step2  mask_bpp, mask_bpp_shift, WK2, MASK

   .if ((flags) & FLAG_NO_PRELOAD_DST) == 0

         preload_leading_step2  dst_r_bpp, dst_bpp_shift, WK3, DST

   .endif

-        leading_15bytes  process_head, process_tail

+        leading_15bytes  \process_head, \process_tail

 154:    /* Destination now 16-byte aligned; we have at least one prefetch on each channel as well as at least one 16-byte output block */

   .if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)

         and     SCRATCH, SRC, #31

         rsb     SCRATCH, SCRATCH, #32*prefetch_distance

   .elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)

         and     SCRATCH, MASK, #31

         rsb     SCRATCH, SCRATCH, #32*prefetch_distance

   .endif

-  .ifc "process_inner_loop",""

-        switch_on_alignment  wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, wide_case_inner_loop, 157f

+  .ifc "\process_inner_loop",""

+        switch_on_alignment  wide_case_inner_loop_and_trailing_pixels, \process_head, \process_tail, wide_case_inner_loop, 157f

   .else

-        switch_on_alignment  wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, process_inner_loop, 157f

+        switch_on_alignment  wide_case_inner_loop_and_trailing_pixels, \process_head, \process_tail, \process_inner_loop, 157f

   .endif

 157:    /* Check for another line */

         end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_WIDE), 151b

   .if (flags) & FLAG_SPILL_LINE_VARS_WIDE

    .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4

    .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4

   .endif

@@ -820,80 +852,80 @@ 160:    /* Medium case */

         mov     ORIG_W, X

  .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE

         /* This is stmdb sp!,{} */

         .word   0xE92D0000 | LINE_SAVED_REGS

   .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4

   .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4

  .endif

 161:    /* New line */

-        newline

+        \newline

         preload_line 0, src_bpp, src_bpp_shift, SRC  /* in: X, corrupts: WK0-WK1 */

         preload_line 0, mask_bpp, mask_bpp_shift, MASK

  .if ((flags) & FLAG_NO_PRELOAD_DST) == 0

         preload_line 0, dst_r_bpp, dst_bpp_shift, DST

  .endif

         sub     X, X, #128/dst_w_bpp     /* simplifies inner loop termination */

         ands    WK0, DST, #15

         beq     164f

         rsb     WK0, WK0, #16 /* number of leading bytes until destination aligned */

-        leading_15bytes  process_head, process_tail

+        leading_15bytes  \process_head, \process_tail

 164:    /* Destination now 16-byte aligned; we have at least one 16-byte output block */

-        switch_on_alignment  medium_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 167f

+        switch_on_alignment  medium_case_inner_loop_and_trailing_pixels, \process_head, \process_tail,, 167f

 167:    /* Check for another line */

         end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 161b

  .ltorg

 170:    /* Narrow case, less than 31 bytes, so no guarantee of at least one 16-byte block */

  .if dst_w_bpp < 32

         mov     ORIG_W, X

  .endif

  .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE

         /* This is stmdb sp!,{} */

         .word   0xE92D0000 | LINE_SAVED_REGS

  .endif

 171:    /* New line */

-        newline

+        \newline

         preload_line 1, src_bpp, src_bpp_shift, SRC  /* in: X, corrupts: WK0-WK1 */

         preload_line 1, mask_bpp, mask_bpp_shift, MASK

  .if ((flags) & FLAG_NO_PRELOAD_DST) == 0

         preload_line 1, dst_r_bpp, dst_bpp_shift, DST

  .endif

  .if dst_w_bpp == 8

         tst     DST, #3

         beq     174f

 172:    subs    X, X, #1

         blo     177f

-        process_head  , 1, 0, 1, 1, 0

-        process_tail  , 1, 0

+        \process_head  , 1, 0, 1, 1, 0

+        \process_tail  , 1, 0

   .if !((flags) & FLAG_PROCESS_DOES_STORE)

         pixst   , 1, 0, DST

   .endif

         tst     DST, #3

         bne     172b

  .elseif dst_w_bpp == 16

         tst     DST, #2

         beq     174f

         subs    X, X, #1

         blo     177f

-        process_head  , 2, 0, 1, 1, 0

-        process_tail  , 2, 0

+        \process_head  , 2, 0, 1, 1, 0

+        \process_tail  , 2, 0

   .if !((flags) & FLAG_PROCESS_DOES_STORE)

         pixst   , 2, 0, DST

   .endif

  .endif

 174:    /* Destination now 4-byte aligned; we have 0 or more output bytes to go */

-        switch_on_alignment  narrow_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 177f

+        switch_on_alignment  narrow_case_inner_loop_and_trailing_pixels, \process_head, \process_tail,, 177f

 177:    /* Check for another line */

         end_of_line %(dst_w_bpp < 32), %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 171b, last_one

  .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE

   .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4

   .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4

  .endif

@@ -903,17 +935,17 @@ 197:

  .endif

 198:

  .if (flags) & FLAG_PROCESS_CORRUPTS_WK0

   .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET-4

   .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET-4

         add     sp, sp, #4

  .endif

-        cleanup

+        \cleanup

 #ifdef DEBUG_PARAMS

         add     sp, sp, #9*4 /* junk the debug copy of arguments */

 #endif

 199:

         pop     {r4-r11, pc}  /* exit */

  .ltorg

@@ -927,23 +959,23 @@ 199:

     .unreq  MASK

     .unreq  STRIDE_M

     .unreq  WK0

     .unreq  WK1

     .unreq  WK2

     .unreq  WK3

     .unreq  SCRATCH

     .unreq  ORIG_W

-    .endfunc

+    pixman_end_asm_function

 .endm

 .macro line_saved_regs  x:vararg

  .set LINE_SAVED_REGS, 0

  .set LINE_SAVED_REG_COUNT, 0

- .irp SAVED_REG,x

+ .irp SAVED_REG,\x

   .ifc "SAVED_REG","Y"

    .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<1)

    .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1

   .endif

   .ifc "SAVED_REG","STRIDE_D"

    .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<3)

    .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1

   .endif

Source code

Revision control

Copy as Markdown

Other Tools