yuv_row_win64.cpp - mozsearch

mozilla-central/gfx/ycbcr/yuv_row_win64.cpp (file symbol)

Enable keyboard shortcuts

Source code

File a bug in Core :: Graphics

Revision control

Copy as Markdown

Other Tools

// Copyright (c) 2010 The Chromium Authors. All rights reserved.

// Use of this source code is governed by a BSD-style license that can be

// found in the LICENSE file.

#include "yuv_row.h"

extern "C" {

// x64 compiler doesn't support MMX and inline assembler.  Use SSE2 intrinsics.

#define kCoefficientsRgbU (reinterpret_cast<const uint8_t*>(kCoefficientsRgbY) + 2048)

#define kCoefficientsRgbV (reinterpret_cast<const uint8_t*>(kCoefficientsRgbY) + 4096)

#include <emmintrin.h>

static void FastConvertYUVToRGB32Row_SSE2(const uint8_t* y_buf,

                                          const uint8_t* u_buf,

                                          const uint8_t* v_buf,

                                          uint8_t* rgb_buf,

                                          int width) {

  __m128i xmm0, xmmY1, xmmY2;

  __m128  xmmY;

  while (width >= 2) {

    xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbU + 8 * *u_buf++)),

                          _mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbV + 8 * *v_buf++)));

    xmmY1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(reinterpret_cast<const uint8_t*>(kCoefficientsRgbY) + 8 * *y_buf++));

    xmmY1 = _mm_adds_epi16(xmmY1, xmm0);

    xmmY2 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(reinterpret_cast<const uint8_t*>(kCoefficientsRgbY) + 8 * *y_buf++));

    xmmY2 = _mm_adds_epi16(xmmY2, xmm0);

    xmmY = _mm_shuffle_ps(_mm_castsi128_ps(xmmY1), _mm_castsi128_ps(xmmY2),

                          0x44);

    xmmY1 = _mm_srai_epi16(_mm_castps_si128(xmmY), 6);

    xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);

    _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), xmmY1);

    rgb_buf += 8;

    width -= 2;

  if (width) {

    xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbU + 8 * *u_buf)),

                          _mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbV + 8 * *v_buf)));

    xmmY1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(reinterpret_cast<const uint8_t*>(kCoefficientsRgbY) + 8 * *y_buf));

    xmmY1 = _mm_adds_epi16(xmmY1, xmm0);

    xmmY1 = _mm_srai_epi16(xmmY1, 6);

    xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);

    *reinterpret_cast<uint32_t*>(rgb_buf) = _mm_cvtsi128_si32(xmmY1);

static void ScaleYUVToRGB32Row_SSE2(const uint8_t* y_buf,

                                    const uint8_t* u_buf,

                                    const uint8_t* v_buf,

                                    uint8_t* rgb_buf,

                                    int width,

                                    int source_dx) {

  __m128i xmm0, xmmY1, xmmY2;

  __m128  xmmY;

  uint8_t u, v, y;

  int x = 0;

  while (width >= 2) {

    u = u_buf[x >> 17];

    v = v_buf[x >> 17];

    y = y_buf[x >> 16];

    x += source_dx;

    xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbU + 8 * u)),

                          _mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbV + 8 * v)));

    xmmY1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(reinterpret_cast<const uint8_t*>(kCoefficientsRgbY) + 8 * y));

    xmmY1 = _mm_adds_epi16(xmmY1, xmm0);

    y = y_buf[x >> 16];

    x += source_dx;

    xmmY2 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(reinterpret_cast<const uint8_t*>(kCoefficientsRgbY) + 8 * y));

    xmmY2 = _mm_adds_epi16(xmmY2, xmm0);

    xmmY = _mm_shuffle_ps(_mm_castsi128_ps(xmmY1), _mm_castsi128_ps(xmmY2),

                          0x44);

    xmmY1 = _mm_srai_epi16(_mm_castps_si128(xmmY), 6);

    xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);

    _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), xmmY1);

    rgb_buf += 8;

    width -= 2;

  if (width) {

    u = u_buf[x >> 17];

    v = v_buf[x >> 17];

    y = y_buf[x >> 16];

    xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbU + 8 * u)),

                          _mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbV + 8 * v)));

    xmmY1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(reinterpret_cast<const uint8_t*>(kCoefficientsRgbY) + 8 * y));

    xmmY1 = _mm_adds_epi16(xmmY1, xmm0);

    xmmY1 = _mm_srai_epi16(xmmY1, 6);

    xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);

    *reinterpret_cast<uint32_t*>(rgb_buf) = _mm_cvtsi128_si32(xmmY1);

static void LinearScaleYUVToRGB32Row_SSE2(const uint8_t* y_buf,

                                          const uint8_t* u_buf,

                                          const uint8_t* v_buf,

                                          uint8_t* rgb_buf,

                                          int width,

                                          int source_dx) {

  __m128i xmm0, xmmY1, xmmY2;

  __m128  xmmY;

  uint8_t u0, u1, v0, v1, y0, y1;

  uint32_t uv_frac, y_frac, u, v, y;

  int x = 0;

  if (source_dx >= 0x20000) {

    x = 32768;

  while(width >= 2) {

    u0 = u_buf[x >> 17];

    u1 = u_buf[(x >> 17) + 1];

    v0 = v_buf[x >> 17];

    v1 = v_buf[(x >> 17) + 1];

    y0 = y_buf[x >> 16];

    y1 = y_buf[(x >> 16) + 1];

    uv_frac = (x & 0x1fffe);

    y_frac = (x & 0xffff);

    u = (uv_frac * u1 + (uv_frac ^ 0x1fffe) * u0) >> 17;

    v = (uv_frac * v1 + (uv_frac ^ 0x1fffe) * v0) >> 17;

    y = (y_frac * y1 + (y_frac ^ 0xffff) * y0) >> 16;

    x += source_dx;

    xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbU + 8 * u)),

                          _mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbV + 8 * v)));

    xmmY1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(reinterpret_cast<const uint8_t*>(kCoefficientsRgbY) + 8 * y));

    xmmY1 = _mm_adds_epi16(xmmY1, xmm0);

    y0 = y_buf[x >> 16];

    y1 = y_buf[(x >> 16) + 1];

    y_frac = (x & 0xffff);

    y = (y_frac * y1 + (y_frac ^ 0xffff) * y0) >> 16;

    x += source_dx;

    xmmY2 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(reinterpret_cast<const uint8_t*>(kCoefficientsRgbY) + 8 * y));

    xmmY2 = _mm_adds_epi16(xmmY2, xmm0);

    xmmY = _mm_shuffle_ps(_mm_castsi128_ps(xmmY1), _mm_castsi128_ps(xmmY2),

                          0x44);

    xmmY1 = _mm_srai_epi16(_mm_castps_si128(xmmY), 6);

    xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);

    _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), xmmY1);

    rgb_buf += 8;

    width -= 2;

  if (width) {

    u = u_buf[x >> 17];

    v = v_buf[x >> 17];

    y = y_buf[x >> 16];

    xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbU + 8 * u)),

                          _mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbV + 8 * v)));

    xmmY1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(reinterpret_cast<const uint8_t*>(kCoefficientsRgbY) + 8 * y));

    xmmY1 = _mm_adds_epi16(xmmY1, xmm0);

    xmmY1 = _mm_srai_epi16(xmmY1, 6);

    xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);

    *reinterpret_cast<uint32_t*>(rgb_buf) = _mm_cvtsi128_si32(xmmY1);

void FastConvertYUVToRGB32Row(const uint8_t* y_buf,

                              const uint8_t* u_buf,

                              const uint8_t* v_buf,

                              uint8_t* rgb_buf,

                              int width) {

  FastConvertYUVToRGB32Row_SSE2(y_buf, u_buf, v_buf, rgb_buf, width);

void ScaleYUVToRGB32Row(const uint8_t* y_buf,

                        const uint8_t* u_buf,

                        const uint8_t* v_buf,

                        uint8_t* rgb_buf,

                        int width,

                        int source_dx) {

  ScaleYUVToRGB32Row_SSE2(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);

void LinearScaleYUVToRGB32Row(const uint8_t* y_buf,

                              const uint8_t* u_buf,

                              const uint8_t* v_buf,

                              uint8_t* rgb_buf,

                              int width,

                              int source_dx) {

  LinearScaleYUVToRGB32Row_SSE2(y_buf, u_buf, v_buf, rgb_buf, width,

                                source_dx);

} // extern "C"