#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// UTF-8 byte patterns:
//   0XXXXXXX  head byte of a 1-byte sequence (ASCII)
//   110XXXXX  head byte of a 2-byte sequence
//   1110XXXX  head byte of a 3-byte sequence
//   11110XXX  head byte of a 4-byte sequence
//   10XXXXXX  continuing (tail) byte
//
// Example byte stream (2-byte, 3-byte, 1-byte sequence, then the terminator):
//   11010111 10101111 11101101 10111100 10001011 01001101 00000000
//   ^

// Report an invalid UTF-8 string on stderr and terminate the program.
// `string` is the whole offending string, `reason` a one-line explanation.
static void utf8_die(const char *string, const char *reason)
{
    fprintf(stderr, "Invalid UTF-8 string: \"%s\"\n", string);
    fprintf(stderr, "%s\n", reason);
    assert(0);
    // assert() expands to nothing when NDEBUG is defined; abort() guarantees
    // that the caller's loop never continues on an invalid string.
    abort();
}

// Count the code points in a NUL-terminated UTF-8 string.
//
// The scan hops from head byte to head byte, using the head byte's bit
// pattern to learn how many bytes the sequence occupies.
//
// string: NUL-terminated byte string; must not be NULL.
// Returns the number of code points before the terminating NUL.
//
// If the string is malformed (stray tail byte, invalid head byte, or a head
// byte not followed by the required number of tail bytes), a diagnostic is
// written to stderr and the program is aborted.
size_t utf8_strlen(const char *string)
{
    size_t num_code_points = 0;
    const char *cursor = string;

    while (*cursor != '\0') {
        // Work on an unsigned copy so the bit tests do not depend on whether
        // plain char is signed on this platform.
        unsigned char head = (unsigned char)*cursor;
        size_t sequence_len = 0;

        if ((head & 0x80) == 0x00) {
            sequence_len = 1; // ASCII
        } else if ((head & 0xE0) == 0xC0) {
            sequence_len = 2; // 2-byte head byte
        } else if ((head & 0xF0) == 0xE0) {
            sequence_len = 3; // 3-byte head byte
        } else if ((head & 0xF8) == 0xF0) {
            sequence_len = 4; // 4-byte head byte
        } else if ((head & 0xC0) == 0x80) {
            // Tail byte: we only ever stop on head bytes, so seeing one here
            // means the string is malformed.
            utf8_die(string, "Found a tail byte when head byte was expected");
        } else {
            // 11111XXX is not a valid UTF-8 byte.
            utf8_die(string, "Head byte indicates invalid length");
        }

        // Check every tail byte before stepping over it. The terminating NUL
        // does not match 10XXXXXX, so a truncated sequence is caught here and
        // the cursor can never move past the end of the string.
        for (size_t i = 1; i < sequence_len; i++) {
            if (((unsigned char)cursor[i] & 0xC0) != 0x80) {
                utf8_die(string,
                         "Head byte is not followed by enough tail bytes");
            }
        }

        cursor += sequence_len;
        num_code_points++;
    }

    return num_code_points;
}

// Define UTF8_STRLEN_NO_MAIN to build utf8_strlen() without the demo below,
// e.g. when compiling this file into a test harness that has its own main().
#ifndef UTF8_STRLEN_NO_MAIN
// Demo: compare the byte length from strlen() with the code point count.
int main(void)
{
    const char *s = "😊";

    // %zu is the conversion specifier for size_t.
    printf("Strlen of %s is %zu...hmm...\n", s, strlen(s));
    printf("Correct strlen of %s is %zu...yay!\n", s, utf8_strlen(s));
    return 0;
}
#endif