Twitch SDK (Internal)
utf8.h
Go to the documentation of this file.
1 /********************************************************************************************
2 * Twitch Broadcasting SDK
3 *
4 * This software is supplied under the terms of a license agreement with Twitch Interactive, Inc. and
5 * may not be copied or used except in accordance with the terms of that agreement
6 * Copyright (c) 2012-2016 Twitch Interactive, Inc.
7 *********************************************************************************************/
8 
9 #pragma once
10 
12 
13 namespace ttv
14 {
// Reports whether the most significant bit of `b` is set, i.e. whether the byte
// lies outside the 7-bit ASCII range.
//
// Note that this is true for every byte in 0x80-0xFF, so UTF-8 continuation
// bytes (10xxxxxx) satisfy this test as well as lead bytes.
//
// @param b The byte to inspect.
// @return true if bit 7 of `b` is 1, false otherwise.
inline bool IsUtf8StartByte(char b)
{
    // Work on the unsigned value so the result does not depend on whether
    // plain `char` is signed on this platform.
    const unsigned char value = static_cast<unsigned char>(b);
    return (value >> 7) == 1;
}
20 
21 
// Reports whether `b` has the bit pattern 10xxxxxx (0x80-0xBF), which is the
// form of a UTF-8 continuation byte.
//
// @param b The byte to inspect.
// @return true if the two highest-order bits of `b` are `10`, false otherwise.
inline bool IsUtf8SequenceByte(char b)
{
    // Shift the two top bits down and compare them against binary 10.
    const unsigned char value = static_cast<unsigned char>(b);
    return (value >> 6) == 0x2;
}
27 
28  inline const char* AdvanceToNextUtf8Character(const char* cur, int& sequenceLength)
29  {
30  // http://en.wikipedia.org/wiki/UTF-8
31 
32  const char* next = cur;
33 
34  if (IsUtf8StartByte(*next))
35  {
36  next++;
37 
38  while (IsUtf8SequenceByte(*next))
39  {
40  next++;
41  }
42  }
43  else
44  {
45  if ((*next) != '\0')
46  {
47  next++;
48  }
49  }
50 
51  sequenceLength = static_cast<int>(next - cur);
52 
53  return next;
54  }
55 
56  inline int CountUtf8Bytes(const char* start, int numCharacters)
57  {
58  const char* cur = start;
59  int sequenceLength = 0;
60 
61  for (int i = 0; i < numCharacters; ++i)
62  {
63  cur = AdvanceToNextUtf8Character(cur, sequenceLength);
64  }
65 
66  return static_cast<int>(cur - start);
67  }
68 
69 
// Checks whether the null-terminated string `str` is well-formed UTF-8.
//
// Rejected input includes: stray continuation bytes, the lead bytes 0xC0, 0xC1
// and 0xF5-0xFF, second bytes outside the restricted range required after the
// lead bytes 0xE0, 0xED, 0xF0 and 0xF4, and multi-byte characters that are cut
// short by the end of the string.
//
// @param str              Null-terminated byte string to validate.
// @param hasFourByteChars Reset to false on entry, then set to true as soon as a
//                         4-byte lead byte (0xF0-0xF4) is seen, so callers can
//                         handle the conversion from UTF-8 to Java's modified
//                         UTF-8. It may be true even when the function returns
//                         false.
// @return true if every character in `str` is a well-formed UTF-8 sequence.
inline bool IsValidUtf8(const char* str, bool& hasFourByteChars)
{
    hasFourByteChars = false;

    const char* cursor = str;

    while (*cursor != '\0')
    {
        const int lead = static_cast<unsigned char>(*cursor);
        ++cursor;

        // 0xxxxxxx (0x00-0x7F): single-byte ASCII character, nothing more to check.
        if (lead < 0x80)
        {
            continue;
        }

        // Number of continuation bytes that must follow the lead byte, and the
        // range allowed for the first of them. Most lead bytes accept the full
        // continuation range 0x80-0xBF; a few narrow it.
        int trailingBytes = 0;
        int secondLow = 0x80;
        int secondHigh = 0xBF;

        if (lead >= 0xC2 && lead <= 0xDF)
        {
            // 110xxxxx, excluding 0xC0 and 0xC1 which are invalid.
            trailingBytes = 1;
        }
        else if (lead >= 0xE0 && lead <= 0xEF)
        {
            // 1110xxxx
            trailingBytes = 2;

            if (lead == 0xE0)
            {
                // After 0xE0 the second byte must be 0xA0-0xBF.
                secondLow = 0xA0;
            }
            else if (lead == 0xED)
            {
                // After 0xED the second byte must be 0x80-0x9F.
                secondHigh = 0x9F;
            }
        }
        else if (lead >= 0xF0 && lead <= 0xF4)
        {
            // 11110xxx, excluding 0xF5-0xF7 which are invalid.
            trailingBytes = 3;
            hasFourByteChars = true;

            if (lead == 0xF0)
            {
                // After 0xF0 the second byte must be 0x90-0xBF.
                secondLow = 0x90;
            }
            else if (lead == 0xF4)
            {
                // After 0xF4 the second byte must be 0x80-0x8F.
                secondHigh = 0x8F;
            }
        }
        else
        {
            // 0x80-0xC1 and 0xF5-0xFF can never start a character.
            return false;
        }

        for (int position = 1; position <= trailingBytes; ++position)
        {
            const int unit = static_cast<unsigned char>(*cursor);
            const int low = (position == 1) ? secondLow : 0x80;
            const int high = (position == 1) ? secondHigh : 0xBF;

            // A terminator here (unit == 0) means the character was truncated;
            // it fails the range check without the cursor moving past it.
            if (unit < low || unit > high)
            {
                return false;
            }

            ++cursor;
        }
    }

    // Every character was consumed in full before the terminator was reached.
    return true;
}
239 }
bool IsUtf8SequenceByte(char b)
Definition: utf8.h:22
int CountUtf8Bytes(const char *start, int numCharacters)
Definition: utf8.h:56
bool IsUtf8StartByte(char b)
Definition: utf8.h:15
JSON (JavaScript Object Notation).
Definition: adsapi.h:16
bool IsValidUtf8(const char *str, bool &hasFourByteChars)
Definition: utf8.h:82
const char * AdvanceToNextUtf8Character(const char *cur, int &sequenceLength)
Definition: utf8.h:28