/****************************************************************************
 * Twitch SDK
 *
 * This software is supplied under the terms of a license agreement with
 * Twitch Interactive, Inc. and may not be copied or used except in accordance
 * with the terms of that agreement
 *
 * Copyright (c) 2012-2016 Twitch Interactive, Inc.
 ***************************************************************************/

#pragma once

#include "twitchsdk/core/types/coretypes.h"

namespace ttv {
/**
 * Reports whether a byte has its most significant bit set, i.e. whether it
 * lies outside the 7-bit ASCII range.
 *
 * NOTE: despite the name, only the top bit is inspected, so continuation
 * bytes (0x80-0xBF) are reported as "start" bytes as well.
 *
 * @param[in] b The byte to inspect
 * @return true if the byte value is 0x80 or greater, false otherwise
 */
inline bool IsUtf8StartByte(char b) {
  // Compare as unsigned so the result does not depend on the signedness of char
  return static_cast<unsigned char>(b) >= 0x80;
}

/**
 * Reports whether a byte is a UTF-8 continuation byte (bit pattern 10xxxxxx,
 * i.e. a value in 0x80-0xBF).
 *
 * @param[in] b The byte to inspect
 * @return true if the two highest order bits are 1 and 0, false otherwise
 */
inline bool IsUtf8SequenceByte(char b) {
  const unsigned char value = static_cast<unsigned char>(b);

  // Keep only the two highest order bits and compare them against binary 10
  return (value >> 6) == 0x2;
}

/**
 * Steps over one character of a NUL-terminated string.
 *
 * A byte with the high bit set is consumed together with every continuation
 * byte (10xxxxxx) that directly follows it; the number of continuation bytes
 * is not checked against what the first byte announces. Any other non-NUL
 * byte is consumed on its own. At the terminator nothing is consumed.
 *
 * See http://en.wikipedia.org/wiki/UTF-8
 *
 * @param[in] cur Position to start from; must be non-null since it is dereferenced unconditionally
 * @param[out] sequenceLength Number of bytes stepped over (0 when cur points at the terminator)
 * @return Pointer to the byte following the consumed character, or cur itself at the terminator
 */
inline const char* AdvanceToNextUtf8Character(const char* cur, int& sequenceLength) {
  const char* end = cur;

  // Never step past the terminator
  if (*end != '\0') {
    const bool isMultiByte = IsUtf8StartByte(*end);
    ++end;

    if (isMultiByte) {
      // Swallow the continuation bytes that belong to this character
      while (IsUtf8SequenceByte(*end)) {
        ++end;
      }
    }
  }

  sequenceLength = static_cast<int>(end - cur);
  return end;
}

/**
 * Computes how many bytes the first numCharacters characters of a
 * NUL-terminated string occupy.
 *
 * Characters are delimited by AdvanceToNextUtf8Character. If the string holds
 * fewer than numCharacters characters, the byte length up to (not including)
 * the terminator is returned. A zero or negative numCharacters yields 0.
 *
 * @param[in] start The beginning of the string; must be non-null when numCharacters > 0
 * @param[in] numCharacters The number of characters to measure
 * @return The number of bytes spanned by the requested characters
 */
inline int CountUtf8Bytes(const char* start, int numCharacters) {
  const char* cur = start;

  for (int i = 0; i < numCharacters; ++i) {
    int sequenceLength = 0;
    cur = AdvanceToNextUtf8Character(cur, sequenceLength);

    // A zero-length step only happens at the terminator. Every further
    // iteration would be a no-op, so stop instead of spinning through the
    // remaining count.
    if (sequenceLength == 0) {
      break;
    }
  }

  return static_cast<int>(cur - start);
}

/**
 * Checks whether a NUL-terminated string is well-formed UTF-8.
 *
 * Overlong encodings, UTF-16 surrogates (U+D800-U+DFFF), code points above
 * U+10FFFF, stray continuation bytes and truncated sequences are all rejected.
 *
 * References:
 * - http://www.unicode.org/versions/Unicode10.0.0/ (Table 3.7)
 * - https://stackoverflow.com/questions/6555015/check-for-invalid-utf8
 * - https://en.wikipedia.org/wiki/UTF-8
 * - http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
 *
 * @param[in] str The string (of char* type) to validate; must be non-null since it is dereferenced unconditionally
 * @param[out] hasFourByteChars Reset to false on entry and set to true as soon as a four byte starting
 *             code unit (0xF0-0xF4) is encountered. It stays true even if validation fails afterwards.
 * @return true if the whole string is valid UTF-8 (the empty string is valid), false otherwise
 */
inline bool IsValidUtf8(const char* str, bool& hasFourByteChars) {
  /**
   * Valid UTF-8 code units:
   *   0x00-0x7F: Single byte code unit (ASCII)
   *   0x80-0xBF: Continuation code unit
   *   0xC2-0xDF: Two byte starting code unit
   *   0xE0-0xEF: Three byte starting code unit
   *       After 0xE0 the second byte must be 0xA0-0xBF
   *       After 0xED the second byte must be 0x80-0x9F
   *   0xF0-0xF4: Four byte starting code unit
   *       After 0xF0 the second byte must be 0x90-0xBF
   *       After 0xF4 the second byte must be 0x80-0x8F
   *
   * Invalid UTF-8 code units:
   *   0xC0-0xC1
   *   0xF5-0xFF
   */

  hasFourByteChars = false;

  // Work on unsigned bytes so that range comparisons behave the same regardless of char signedness
  const unsigned char* cursor = reinterpret_cast<const unsigned char*>(str);

  while (*cursor != 0) {
    const unsigned char lead = *cursor++;

    // Single byte character - ASCII
    if (lead <= 0x7F) {
      continue;
    }

    int numContinuationBytes = 0;

    // Allowed range for the second byte; narrowed below for the special starting code units
    unsigned char secondMin = 0x80;
    unsigned char secondMax = 0xBF;

    if (lead >= 0xC2 && lead <= 0xDF) {
      numContinuationBytes = 1;
    } else if (lead >= 0xE0 && lead <= 0xEF) {
      numContinuationBytes = 2;

      if (lead == 0xE0) {
        secondMin = 0xA0;
      } else if (lead == 0xED) {
        secondMax = 0x9F;
      }
    } else if (lead >= 0xF0 && lead <= 0xF4) {
      numContinuationBytes = 3;

      // Set this true so that we can handle the conversion from UTF-8 to Java's modified UTF-8
      hasFourByteChars = true;

      if (lead == 0xF0) {
        secondMin = 0x90;
      } else if (lead == 0xF4) {
        secondMax = 0x8F;
      }
    } else {
      // 0x80-0xBF (continuation without a start), 0xC0-0xC1 and 0xF5-0xFF are not valid start bytes
      return false;
    }

    // The second byte is the only one whose range depends on the starting code unit.
    // The terminator (0x00) is below every allowed range, so a truncated character fails here.
    if (*cursor < secondMin || *cursor > secondMax) {
      return false;
    }
    ++cursor;

    // Every remaining byte must be a plain continuation code unit 10xxxxxx (0x80-0xBF)
    for (int i = 1; i < numContinuationBytes; ++i) {
      if ((*cursor & 0xC0) != 0x80) {
        return false;
      }
      ++cursor;
    }
  }

  // Reached the terminator on a character boundary
  return true;
}
}  // namespace ttv
