#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * Report an invalid UTF-8 string on stderr and terminate the process.
 *
 * string: the complete string being measured (echoed in the diagnostic).
 * reason: one-line description of what was wrong with it.
 *
 * Does not return.
 */
static void utf8_fail(const char *string, const char *reason)
{
    fprintf(stderr, "Invalid UTF-8 string: \"%s\"\n", string);
    fprintf(stderr, "%s\n", reason);
    assert(0);
    /* assert() expands to nothing under NDEBUG; abort() keeps the failure
     * fatal in that configuration instead of letting the caller carry on. */
    abort();
}

/*
 * Count the code points in a NUL-terminated UTF-8 string.
 *
 * The string is walked from head byte to head byte. Each head byte announces
 * how many tail bytes (10xxxxxx) follow it; every announced tail byte is
 * checked before it is skipped, so a sequence cut short by the terminating
 * NUL is reported instead of being read past.
 *
 * string: NUL-terminated UTF-8 text; it is not modified.
 *
 * Returns the number of code points (0 for the empty string).
 *
 * On malformed input (a tail byte where a head byte is expected, a head byte
 * announcing an unsupported length, or a missing/invalid tail byte) a
 * diagnostic is written to stderr and the process is aborted.
 *
 * Only the byte structure is checked: overlong encodings, surrogates and
 * values above U+10FFFF are not rejected.
 */
unsigned long utf8_strlen(const char *string)
{
    unsigned long num_code_points = 0;
    /* Work on unsigned bytes so the bit masks do not depend on whether
     * plain char is signed on this platform. */
    const unsigned char *byte = (const unsigned char *)string;

    while (*byte != '\0') {
        unsigned int tail_bytes = 0;

        if ((*byte & 0x80) == 0x00) {
            /* 0xxxxxxx: ASCII, no tail bytes */
            tail_bytes = 0;
        } else if ((*byte & 0xE0) == 0xC0) {
            /* 110xxxxx: 2-byte sequence */
            tail_bytes = 1;
        } else if ((*byte & 0xF0) == 0xE0) {
            /* 1110xxxx: 3-byte sequence */
            tail_bytes = 2;
        } else if ((*byte & 0xF8) == 0xF0) {
            /* 11110xxx: 4-byte sequence */
            tail_bytes = 3;
        } else if ((*byte & 0xC0) == 0x80) {
            /* 10xxxxxx: a tail byte, but we only ever stop on head bytes */
            utf8_fail(string, "Found a tail byte when head byte was expected");
        } else {
            /* 11111xxx: not a valid UTF-8 byte */
            utf8_fail(string, "Head byte indicates invalid length");
        }

        byte++;
        for (unsigned int i = 0; i < tail_bytes; i++) {
            /* The NUL terminator fails this check too, so the pointer can
             * never step beyond the end of the string. */
            if ((*byte & 0xC0) != 0x80) {
                utf8_fail(string,
                          "Multi-byte sequence is truncated or has an invalid tail byte");
            }
            byte++;
        }

        num_code_points++;
    }

    return num_code_points;
}

/*
 * Demo driver: prints the byte length (strlen) and the code point count
 * (utf8_strlen) of a few sample strings.
 *
 * Define UTF8_STRLEN_NO_MAIN to compile utf8_strlen() without this driver,
 * e.g. when building it into a test harness that supplies its own main().
 */
#ifndef UTF8_STRLEN_NO_MAIN
int main(void)
{
    /* NOTE(review): the last two samples render identically; presumably they
     * differ in Unicode normalization form - verify the bytes in the file. */
    const char *samples[] = {
        "Hello World",
        "Hellо Wоrld",
        "Hellⲟ W𐓪rld",
        "Ⓗⓔⓛⓛⓞ Ⓦⓞⓡⓛⓓ",
        "Hëllo World",
        "Hëllo World",
    };

    for (size_t i = 0; i < sizeof samples / sizeof samples[0]; i++) {
        /* strlen() returns size_t, hence %zu; utf8_strlen() returns
         * unsigned long, hence %lu. */
        printf("\"%s\": strlen=%zu, utf8_strlen=%lu\n",
               samples[i], strlen(samples[i]), utf8_strlen(samples[i]));
    }

    return 0;
}
#endif /* UTF8_STRLEN_NO_MAIN */