DXR is a code search and navigation tool aimed at making sense of large projects. It supports full-text and regex searches as well as structural queries.

Header

Mercurial (d8847129d134)

VCS Links

Line Code
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188
/*
 * Copyright 2013 The Android Open Source Project
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */


#include "SkBitmap.h"
#include "SkColorPriv.h"
#include "SkBlurImage_opts.h"
#include "SkRect.h"

#include <arm_neon.h>

namespace {

enum BlurDirection {
    kX, kY
};

/**
 * Helper function to load 2 pixels from diffent rows to a 8x8 NEON register
 * and also pre-load pixels for future read
 */
template<BlurDirection srcDirection>
inline uint8x8_t load_2_pixels(const SkPMColor* src, int srcStride) {
    if (srcDirection == kX) {
        uint32x2_t temp = vdup_n_u32(0);
        // 10% faster by adding these 2 prefetches
        SK_PREFETCH(src + 16);
        SK_PREFETCH(src + srcStride + 16);
        return vreinterpret_u8_u32(vld1_lane_u32(src + srcStride, vld1_lane_u32(src, temp, 0), 1));
     } else {
         return vld1_u8((uint8_t*)src);
     }
}

/**
 * Helper function to store the low 8-bits from a 16x8 NEON register to 2 rows
 */
template<BlurDirection dstDirection>
inline void store_2_pixels(uint16x8_t result16x8, SkPMColor* dst, int dstStride) {
    if (dstDirection == kX) {
        uint32x2_t temp = vreinterpret_u32_u8(vmovn_u16(result16x8));
        vst1_lane_u32(dst, temp, 0);
        vst1_lane_u32(dst + dstStride, temp, 1);
    } else {
        uint8x8_t temp = vmovn_u16(result16x8);
        vst1_u8((uint8_t*)dst, temp);
    }
}

/**
 * fast path for kernel size less than 128
 */
template<BlurDirection srcDirection, BlurDirection dstDirection>
void SkDoubleRowBoxBlur_NEON(const SkPMColor** src, int srcStride, SkPMColor** dst, int kernelSize,
                        int leftOffset, int rightOffset, int width, int* height)
{
    const int rightBorder = SkMin32(rightOffset + 1, width);
    const int srcStrideX = srcDirection == kX ? 1 : srcStride;
    const int dstStrideX = dstDirection == kX ? 1 : *height;
    const int srcStrideY = srcDirection == kX ? srcStride : 1;
    const int dstStrideY = dstDirection == kX ? width : 1;
    const uint16x8_t scale = vdupq_n_u16((1 << 15) / kernelSize);

    for (; *height >= 2; *height -= 2) {
        uint16x8_t sum = vdupq_n_u16(0);
        const SkPMColor* p = *src;
        for (int i = 0; i < rightBorder; i++) {
            sum = vaddw_u8(sum,
                load_2_pixels<srcDirection>(p, srcStride));
            p += srcStrideX;
        }

        const SkPMColor* sptr = *src;
        SkPMColor* dptr = *dst;
        for (int x = 0; x < width; x++) {
            // val = (sum * scale * 2 + 0x8000) >> 16
            uint16x8_t resultPixels = vreinterpretq_u16_s16(vqrdmulhq_s16(
                vreinterpretq_s16_u16(sum), vreinterpretq_s16_u16(scale)));
            store_2_pixels<dstDirection>(resultPixels, dptr, width);

            if (x >= leftOffset) {
                sum = vsubw_u8(sum,
                    load_2_pixels<srcDirection>(sptr - leftOffset * srcStrideX, srcStride));
            }
            if (x + rightOffset + 1 < width) {
                sum = vaddw_u8(sum,
                    load_2_pixels<srcDirection>(sptr + (rightOffset + 1) * srcStrideX, srcStride));
            }
            sptr += srcStrideX;
            dptr += dstStrideX;
        }
        *src += srcStrideY * 2;
        *dst += dstStrideY * 2;
    }
}


/**
 * Helper function to spread the components of a 32-bit integer into the
 * lower 8 bits of each 16-bit element of a NEON register.
 */

static inline uint16x4_t expand(uint32_t a) {
    // ( ARGB ) -> ( ARGB ARGB ) -> ( A R G B A R G B )
    uint8x8_t v8 = vreinterpret_u8_u32(vdup_n_u32(a));
    // ( A R G B A R G B ) -> ( 0A 0R 0G 0B 0A 0R 0G 0B ) -> ( 0A 0R 0G 0B )
    return vget_low_u16(vmovl_u8(v8));
}

template<BlurDirection srcDirection, BlurDirection dstDirection>
void SkBoxBlur_NEON(const SkPMColor* src, int srcStride, SkPMColor* dst, int kernelSize,
                    int leftOffset, int rightOffset, int width, int height)
{
    const int rightBorder = SkMin32(rightOffset + 1, width);
    const int srcStrideX = srcDirection == kX ? 1 : srcStride;
    const int dstStrideX = dstDirection == kX ? 1 : height;
    const int srcStrideY = srcDirection == kX ? srcStride : 1;
    const int dstStrideY = dstDirection == kX ? width : 1;
    const uint32x4_t scale = vdupq_n_u32((1 << 24) / kernelSize);
    const uint32x4_t half = vdupq_n_u32(1 << 23);

    if (kernelSize < 128)
    {
        SkDoubleRowBoxBlur_NEON<srcDirection, dstDirection>(&src, srcStride, &dst, kernelSize,
            leftOffset, rightOffset, width, &height);
    }

    for (; height > 0; height--) {
        uint32x4_t sum = vdupq_n_u32(0);
        const SkPMColor* p = src;
        for (int i = 0; i < rightBorder; ++i) {
            sum = vaddw_u16(sum, expand(*p));
            p += srcStrideX;
        }

        const SkPMColor* sptr = src;
        SkPMColor* dptr = dst;
        for (int x = 0; x < width; ++x) {
            // ( half+sumA*scale half+sumR*scale half+sumG*scale half+sumB*scale )
            uint32x4_t result = vmlaq_u32(half, sum, scale);

            // Saturated conversion to 16-bit.
            // ( AAAA RRRR GGGG BBBB ) -> ( 0A 0R 0G 0B )
            uint16x4_t result16 = vqshrn_n_u32(result, 16);

            // Saturated conversion to 8-bit.
            // ( 0A 0R 0G 0B ) -> ( 0A 0R 0G 0B 0A 0R 0G 0B ) -> ( A R G B A R G B )
            uint8x8_t result8 = vqshrn_n_u16(vcombine_u16(result16, result16), 8);

            // ( A R G B A R G B ) -> ( ARGB ARGB ) -> ( ARGB )
            // Store low 32 bits to destination.
            vst1_lane_u32(dptr, vreinterpret_u32_u8(result8), 0);

            if (x >= leftOffset) {
                const SkPMColor* l = sptr - leftOffset * srcStrideX;
                sum = vsubw_u16(sum, expand(*l));
            }
            if (x + rightOffset + 1 < width) {
                const SkPMColor* r = sptr + (rightOffset + 1) * srcStrideX;
                sum = vaddw_u16(sum, expand(*r));
            }
            sptr += srcStrideX;
            if (srcDirection == kX) {
                SK_PREFETCH(sptr + (rightOffset + 16) * srcStrideX);
            }
            dptr += dstStrideX;
        }
        src += srcStrideY;
        dst += dstStrideY;
    }
}

} // namespace

bool SkBoxBlurGetPlatformProcs_NEON(SkBoxBlurProc* boxBlurX,
                                    SkBoxBlurProc* boxBlurY,
                                    SkBoxBlurProc* boxBlurXY,
                                    SkBoxBlurProc* boxBlurYX) {
    *boxBlurX = SkBoxBlur_NEON<kX, kX>;
    *boxBlurY = SkBoxBlur_NEON<kY, kY>;
    *boxBlurXY = SkBoxBlur_NEON<kX, kY>;
    *boxBlurYX = SkBoxBlur_NEON<kY, kX>;
    return true;
}