#! /usr/bin/env python3

# Python has a built-in module for dealing with Unicode strings
# Updated regularly to match the latest Unicode standard
import unicodedata

# Sample strings that all render (almost) like "Hello World" but differ at the
# code point level. Look-alike characters are written as \N{...} escapes so the
# distinction is visible in the source and cannot be silently destroyed by an
# editor or tool that normalizes the file on save.

# Plain ASCII baseline
string1 = "Hello World"

# Cyrillic homoglyphs: both o's are U+043E, not the Latin U+006F
string2 = "Hell\N{CYRILLIC SMALL LETTER O} W\N{CYRILLIC SMALL LETTER O}rld"

# More homoglyphs: Coptic o (U+2C9F) and Armenian oh (U+0585)
string3 = "Hell\N{COPTIC SMALL LETTER O} W\N{ARMENIAN SMALL LETTER OH}rld"

# Letters in circles (U+24xx); visibly different, so kept as literal characters
string4 = "Ⓗⓔⓛⓛⓞ Ⓦⓞⓡⓛⓓ"

# Precomposed form: e with a diaeresis as a single code point (U+00EB)
string5 = "H\N{LATIN SMALL LETTER E WITH DIAERESIS}llo World"

# Decomposed form: Latin small letter e followed by a combining diaeresis
# (U+0065 U+0308), i.e. two code points that render as one character
string6 = "He\N{COMBINING DIAERESIS}llo World"

def tryEqualities(s1, s2):
    """Compare two strings directly and under each Unicode normalization form.

    Normalization lets strings that are semantically equivalent, but not
    identical code point for code point, compare as equal:

    * NFC  - Canonical Composition
    * NFKC - Compatibility Composition
    * NFD  - Canonical Decomposition
    * NFKD - Compatibility Decomposition

    Composition folds sequences such as "e" + combining diaeresis into the
    single precomposed "e with a diaeresis"; decomposition goes the other
    way. The compatibility forms are looser than the canonical ones.

    Returns a 5-tuple of booleans, in this order:
    (plain ==, NFC equal, NFKC equal, NFD equal, NFKD equal).
    """
    # The order of the forms fixes the order of the result tuple.
    forms = ('NFC', 'NFKC', 'NFD', 'NFKD')

    verdicts = [s1 == s2]
    for form in forms:
        left = unicodedata.normalize(form, s1)
        right = unicodedata.normalize(form, s2)
        verdicts.append(left == right)
    return tuple(verdicts)

# Compare every unordered pair of sample strings, in the order
# (1,2), (1,3), ..., (1,6), (2,3), ..., (5,6), and print the 5-tuple of
# results from tryEqualities for each pair.
samples = [
    ("string1", string1),
    ("string2", string2),
    ("string3", string3),
    ("string4", string4),
    ("string5", string5),
    ("string6", string6),
]
for position, (left_name, left_value) in enumerate(samples):
    # Only pair with strings that come later, so each pair is printed once.
    for right_name, right_value in samples[position + 1:]:
        print(f"{left_name} == {right_name}:", tryEqualities(left_value, right_value))

# Pause until the user presses Enter before printing the length table.
input()

# len() returns the number of Unicode code points in a string,
# not just the number of bytes (like strlen() in C).
# Note that a code point is not always one visible character: a letter followed
# by a combining mark (as in string6) counts as two.
# encode('utf-8') returns the raw bytes of a string,
# so len() of the encoded result gives the number of bytes in its UTF-8 form.

# For each sample string, print its length in code points followed by the
# length of its UTF-8 encoding in bytes.
for sample in (string1, string2, string3, string4, string5, string6):
    code_point_count = len(sample)
    utf8_byte_count = len(sample.encode('utf-8'))
    print(code_point_count, utf8_byte_count)