/****************************************************************************
 * Twitch SDK
 *
 * This software is supplied under the terms of a license agreement with
 * Twitch Interactive, Inc. and may not be copied or used except in accordance
 * with the terms of that agreement
 *
 * Copyright (c) 2012-2016 Twitch Interactive, Inc.
 ***************************************************************************/

#include "twitchsdk/broadcast/internal/pch.h"

#include "twitchsdk/broadcast/internal/rgbyuv.h"

#ifdef WIN32
#include "twitchsdk/broadcast/internal/rgbyuv_sse.h"

#include <intrin.h>

#define RGBYUV_SSE_ENABLED 1
#endif

#include "twitchsdk/core/systemclock.h"

namespace {
using namespace ttv::broadcast;

//--------------------------------------------------------------------------------------------------
// RGB to YUV420P component conversion functions
//
// Computes the luma (Y) value for a single 4-byte pixel.
//
// bgraPixel  Points at the 4 bytes of one source pixel.
// factors    Four integer weights, one per pixel byte, scaled by 256.
//
// The weighted sum is rounded to nearest (add 128, shift right by 8) and then offset by 16.
inline uint8_t BGRtoY(const uint8_t* bgraPixel, const int* factors) {
  int weightedSum = 0;
  for (int channel = 0; channel < 4; ++channel) {
    weightedSum += factors[channel] * bgraPixel[channel];
  }

  const int scaled = (weightedSum + 128) >> 8;
  return static_cast<uint8_t>(scaled + 16);
}

// Computes the U chroma value for a single 4-byte pixel.
//
// bgraPixel  Points at the 4 bytes of one source pixel.
// factors    Four integer weights, one per pixel byte, scaled by 256.
//
// The weighted sum is rounded to nearest (add 128, shift right by 8) and then re-centred around 128.
inline uint8_t BGRtoU(const uint8_t* bgraPixel, const int* factors) {
  const int term0 = factors[0] * bgraPixel[0];
  const int term1 = factors[1] * bgraPixel[1];
  const int term2 = factors[2] * bgraPixel[2];
  const int term3 = factors[3] * bgraPixel[3];

  const int scaled = (term0 + term1 + term2 + term3 + 128) >> 8;
  return static_cast<uint8_t>(scaled + 128);
}

// Computes the V chroma value for a single 4-byte pixel.
//
// bgraPixel  Points at the 4 bytes of one source pixel.
// factors    Four integer weights, one per pixel byte, scaled by 256.
//
// The weighted sum is rounded to nearest (add 128, shift right by 8) and then re-centred around 128.
inline uint8_t BGRtoV(const uint8_t* bgraPixel, const int* factors) {
  const uint8_t* component = bgraPixel;
  const int* weight = factors;

  int weightedSum = 0;
  for (const uint8_t* const pixelEnd = bgraPixel + 4; component != pixelEnd; ++component, ++weight) {
    weightedSum += (*weight) * (*component);
  }

  return static_cast<uint8_t>(((weightedSum + 128) >> 8) + 128);
}

//--------------------------------------------------------------------------------------------------
void RGBtoYUV_Slow(const uint8_t* rgbBuffer, uint32_t bgraMask, uint width, uint height, uint8_t* YBuffer,
  uint8_t* UVBuffer, YUVFormat yuvFormat, bool verticalFlip) {
  int BGRA_Y[4] = {25, 129, 66, 0};
  int BGRA_U[4] = {112, -74, -38, 0};
  int BGRA_V[4] = {-18, -94, 112, 0};

  uint8_t* factorsMask = reinterpret_cast<uint8_t*>(&bgraMask);

  int YFactors[4];
  int UFactors[4];
  int VFactors[4];
  for (int i = 0; i < 4; ++i) {
    YFactors[i] = BGRA_Y[factorsMask[3 - i]];
    UFactors[i] = BGRA_U[factorsMask[3 - i]];
    VFactors[i] = BGRA_V[factorsMask[3 - i]];
  }

  size_t yCount = 0;
  size_t uvCount = 0;

  // In the case of formats where the UV values are not packed, find the offset at which the
  // second buffer starts
  uint secondBufferOffset = 0;
  if (yuvFormat == YUVFormat::TTV_YUV_I420 || yuvFormat == YUVFormat::TTV_YUV_YV12) {
    TTV_ASSERT(width % 2 == 0 && height % 2 == 0);
    secondBufferOffset = width * height / 4;
  }

  uint startHeight = 0;
  uint endHeight = height;
  int heightIncrement = 1;
  int nextRowOffset = static_cast<int>(width) * 4;
  if (verticalFlip) {
    startHeight = height - 1;
    endHeight = 0;
    heightIncrement = -1;
    nextRowOffset = -1 * static_cast<int>(width) * 4;
  }

  for (uint i = startHeight; i != endHeight; i = static_cast<uint>(static_cast<int>(i) + heightIncrement)) {
    for (uint j = 0; j < width; j += 2) {
      const uint8_t* bgraPixel1 = rgbBuffer + ((i * width + j) * 4);
      const uint8_t* bgraPixel3 = bgraPixel1 + nextRowOffset;
      const uint8_t* bgraPixel2 = bgraPixel1 + 4;
      const uint8_t* bgraPixel4 = bgraPixel3 + 4;

      YBuffer[yCount++] = static_cast<uint8_t>(BGRtoY(bgraPixel1, YFactors));
      YBuffer[yCount++] = static_cast<uint8_t>(BGRtoY(bgraPixel2, YFactors));

      if (i % 2 == 0 && j % 2 == 0) {
        uint8_t uVal = static_cast<uint8_t>((BGRtoU(bgraPixel1, UFactors) + BGRtoU(bgraPixel2, UFactors) +
                                              BGRtoU(bgraPixel3, UFactors) + BGRtoU(bgraPixel4, UFactors)) /
                                            4);
        uint8_t vVal = static_cast<uint8_t>((BGRtoV(bgraPixel1, VFactors) + BGRtoV(bgraPixel2, VFactors) +
                                              BGRtoV(bgraPixel3, VFactors) + BGRtoV(bgraPixel4, VFactors)) /
                                            4);
        switch (yuvFormat) {
          case YUVFormat::TTV_YUV_I420:
          case YUVFormat::TTV_YUV_YV12: {
            uint8_t firstVal = (yuvFormat == YUVFormat::TTV_YUV_I420) ? uVal : vVal;
            uint8_t secondVal = (yuvFormat == YUVFormat::TTV_YUV_I420) ? vVal : uVal;
            UVBuffer[uvCount] = firstVal;
            UVBuffer[secondBufferOffset + uvCount] = secondVal;
            ++uvCount;
          } break;
          case YUVFormat::TTV_YUV_NV12:
            UVBuffer[uvCount++] = uVal;
            UVBuffer[uvCount++] = vVal;
            break;
          default:
            TTV_ASSERT(false);
            break;
        }
      }
    }
  }
}
}  // namespace

//--------------------------------------------------------------------------------------------------
// Converts a packed 4-byte-per-pixel RGB image to YUV, choosing the fastest available implementation.
//
// On WIN32 builds the SSE implementation is used when the width is a multiple of 16 and the CPU reports SSSE3
// support. In every other case the scalar implementation is used. All arguments are forwarded unchanged.
void ttv::broadcast::RGBtoYUV(const uint8_t* rgbBuffer, uint32_t bgraMask, uint width, uint height, uint8_t* YBuffer,
  uint8_t* UVBuffer, YUVFormat yuvFormat, bool verticalFlip /*=false*/) {
#ifdef WIN32
  if (width % 16 == 0) {
    // The SSE path uses the _mm_shuffle_epi8 intrinsic, which requires SSSE3 (Supplemental SSE3).
    // CPUID function 1 reports SSSE3 in bit 9 of ECX (element 2 of the result array).
    // TODO: See if we can use other intrinsics that don't require SSSE3.
    int cpuRegisters[4];
    __cpuid(cpuRegisters, 1);
    const bool hasSsse3 = (cpuRegisters[2] & 0x200) != 0;

    if (hasSsse3) {
      RGBtoYUV_SSE(rgbBuffer, bgraMask, width, height, YBuffer, UVBuffer, yuvFormat, verticalFlip);
      return;
    }
  }
#endif

  RGBtoYUV_Slow(rgbBuffer, bgraMask, width, height, YBuffer, UVBuffer, yuvFormat, verticalFlip);
}
