DXR is a code search and navigation tool aimed at making sense of large projects. It supports full-text and regex searches as well as structural queries.

Header

Mercurial (d8847129d134)

VCS Links

Line Code
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132
/*
 * Copyright 2011 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#include <emmintrin.h>
#include "SkBlitRect_opts_SSE2.h"
#include "SkBlitRow.h"
#include "SkColorPriv.h"

/* Opaque fill for narrow rectangles (fewer than 31 pixels per row):
 * inlines and merges sections of Color32_SSE2 and sk_memset32_SSE2.
 * Only scalar 32-bit stores are used here; the per-row fill is unrolled
 * by four.
 *
 *   destination  first pixel of the first row to fill
 *   width        pixels per row; asserted to be in (0, 31)
 *   height       number of rows; nothing is written when it is <= 0
 *   rowBytes     byte offset from the start of one row to the next
 *   color        value stored into every pixel; asserted to have alpha 255
 */
static void BlitRect32_OpaqueNarrow_SSE2(SkPMColor* SK_RESTRICT destination,
                                  int width, int height,
                                  size_t rowBytes, uint32_t color) {
    SkASSERT(255 == SkGetPackedA32(color));
    SkASSERT(width > 0);
    SkASSERT(width < 31);

    for (int row = 0; row < height; ++row) {
        SkPMColor* out = destination;
        int remaining = width;

        // Four stores per pass for as long as more than four pixels remain.
        for (; remaining > 4; remaining -= 4) {
            out[0] = color;
            out[1] = color;
            out[2] = color;
            out[3] = color;
            out += 4;
        }

        // Whatever is left (at most four pixels) goes out one at a time.
        for (; remaining > 0; --remaining) {
            *out++ = color;
        }

        // Rows are rowBytes apart, so step in bytes rather than in pixels.
        destination = (uint32_t*)((char*)destination + rowBytes);
    }
}

/*
 * Opaque fill for wide rectangles (31 pixels per row or more):
 * inlines and merges sections of Color32_SSE2 and sk_memset32_SSE2.
 *
 * Every row is written in three phases: scalar stores until the write
 * address is 16-byte aligned, aligned 128-bit stores (four pixels each)
 * for the bulk, then scalar stores for the leftover pixels.
 * NOTE(review): assuming pixels are 4-byte aligned, the lead-in consumes
 * at most three pixels, so a 31-pixel row always leaves at least one
 * 16-pixel span for the aligned stores -- confirm against callers.
 *
 *   destination  first pixel of the first row to fill
 *   width        pixels per row; asserted to be >= 31
 *   height       number of rows; nothing is written when it is <= 0
 *   rowBytes     byte offset from the start of one row to the next
 *   color        value stored into every pixel; asserted to have alpha 255
 */
static void BlitRect32_OpaqueWide_SSE2(SkPMColor* SK_RESTRICT destination,
                                int width, int height,
                                size_t rowBytes, uint32_t color) {
    SkASSERT(255 == SkGetPackedA32(color));
    SkASSERT(width >= 31);

    // Four copies of the color packed into one 128-bit value.
    const __m128i fill4 = _mm_set1_epi32(color);

    for (int row = 0; row < height; ++row) {
        // Author's note: prefetching one row ahead into L1 can match the
        // hardware prefetcher for large/tall rects but never *beats* it,
        // so no explicit prefetch is issued.
        SkPMColor* out = destination;
        int remaining = width;

        // Lead-in: single-pixel stores until the address is a multiple
        // of 16 bytes, as the aligned store below requires.
        for (; ((size_t)out) & 0x0F; --remaining) {
            *out++ = color;
        }
        __m128i* wide = reinterpret_cast<__m128i*>(out);

        // Author's notes on the store strategy:
        //  - _mm_stream is reported to only beat _mm_store for data that
        //    would not fit in L2 cache anyway (typically >500kB) and that
        //    precisely fills cache lines; with arrays > 100k elements it
        //    still measured 100%+ slower than _mm_store here.
        //  - Unrolling to 64 pixels per pass was break-even for most
        //    input patterns; 32 already seems to saturate the bus with
        //    low enough overhead.

        // Bulk: eight aligned stores, i.e. 32 pixels, per pass.
        for (; remaining >= 32; remaining -= 32) {
            _mm_store_si128(wide + 0, fill4);
            _mm_store_si128(wide + 1, fill4);
            _mm_store_si128(wide + 2, fill4);
            _mm_store_si128(wide + 3, fill4);
            _mm_store_si128(wide + 4, fill4);
            _mm_store_si128(wide + 5, fill4);
            _mm_store_si128(wide + 6, fill4);
            _mm_store_si128(wide + 7, fill4);
            wide += 8;
        }

        // At most one further half-size pass: four stores, 16 pixels.
        if (remaining >= 16) {
            _mm_store_si128(wide + 0, fill4);
            _mm_store_si128(wide + 1, fill4);
            _mm_store_si128(wide + 2, fill4);
            _mm_store_si128(wide + 3, fill4);
            wide += 4;
            remaining -= 16;
        }
        out = reinterpret_cast<uint32_t*>(wide);

        // Tail: fewer than 16 pixels remain. Author's note: unrolling pays
        // off in the narrow variant, but made no measurable difference
        // here with either _mm_store_si128 or individual sets.
        for (; remaining > 0; --remaining) {
            *out++ = color;
        }

        // Rows are rowBytes apart, so step in bytes rather than in pixels.
        destination = (uint32_t*)((char*)destination + rowBytes);
    }
}

/*
 * Entry point for filling a rectangle of 32-bit pixels with `color`.
 *
 * Returns without touching memory when height, width or color is zero.
 * Otherwise the work is currently always forwarded to
 * SkBlitRow::ColorRect32: the alpha value used for dispatch is forced to
 * 0, so the opaque SSE2 fast paths below are never taken. They are kept
 * in place so they can be switched back on by removing that override.
 *
 *   destination  first pixel of the first row
 *   width        pixels per row
 *   height       number of rows
 *   rowBytes     byte offset from the start of one row to the next
 *   color        packed 32-bit color
 */
void ColorRect32_SSE2(SkPMColor* destination,
                      int width, int height,
                      size_t rowBytes, uint32_t color) {
    if (0 == height || 0 == width || 0 == color) {
        return;
    }

    unsigned alpha = SkGetPackedA32(color);
    // Fast path deliberately skipped for now (it has been disabled since
    // this code was added in r3423): pretend the color is never opaque.
    alpha = 0;

    if (255 != alpha) {
        SkBlitRow::ColorRect32(destination, width, height, rowBytes, color);
        return;
    }

    // Opaque color: choose the fill routine by row width.
    if (width >= 31) {
        BlitRect32_OpaqueWide_SSE2(destination, width, height,
                                   rowBytes, color);
    } else {
        BlitRect32_OpaqueNarrow_SSE2(destination, width, height,
                                     rowBytes, color);
    }
}