import unicodedata
# Six renderings of "Hello World" that look alike (or nearly so) but are built
# from different code points. They are the inputs for the normalization and
# length demonstrations in this script.
string1 = "Hello World"  # normal ASCII
string2 = "Hellо Wоrld"  # These are not latin o's (cyrillic)
string3 = "Hellⲟ Wօrld"  # These are also not latin o's (coptic and armenian)
string4 = "Ⓗⓔⓛⓛⓞ Ⓦⓞⓡⓛⓓ"  # letters in circles
# string5 and string6 render identically, so they are written with explicit
# escapes: a literal "e + combining diaeresis" is silently merged into the
# precomposed character by any editor or tool that NFC-normalizes text, which
# would make the two strings equal and defeat the comparison.
string5 = "H\u00ebllo World"  # e with a diaeresis (one character, U+00EB)
string6 = "He\u0308llo World"  # latin small letter e followed by a combining diaeresis (two characters, U+0065 U+0308)
def tryEqualities(s1: str, s2: str) -> tuple:
    """Compare two strings directly and under each Unicode normalization form.

    Normalization rules are used to compare Unicode characters that are
    semantically equivalent even if they are not identical:

    * NFC  is Canonical Composition
    * NFKC is Compatibility Composition
    * NFD  is Canonical Decomposition
    * NFKD is Compatibility Decomposition

    Compatibility is a less strict equality than Canonical.
    Composition means that e.g. "letter e followed by a combining diaeresis"
    is converted to "e with a diaeresis"; Decomposition means the reverse.

    Args:
        s1: First string to compare.
        s2: Second string to compare.

    Returns:
        A 5-tuple of bools, in this order: plain ``s1 == s2``, then equality
        after normalizing both strings with NFC, NFKC, NFD and NFKD.
    """
    # Order matters: callers read the result positionally.
    forms = ("NFC", "NFKC", "NFD", "NFKD")
    return (
        s1 == s2,
        *(
            unicodedata.normalize(form, s1) == unicodedata.normalize(form, s2)
            for form in forms
        ),
    )
# Run tryEqualities on every unordered pair of sample strings exactly once,
# labelling each result line with the names of the two variables compared.
samples = (
    ("string1", string1),
    ("string2", string2),
    ("string3", string3),
    ("string4", string4),
    ("string5", string5),
    ("string6", string6),
)
for position, (left_name, left_value) in enumerate(samples):
    # Pair each sample only with the ones after it, so no pair repeats.
    for right_name, right_value in samples[position + 1:]:
        print(f"{left_name} == {right_name}:", tryEqualities(left_value, right_value))
# Block until the user presses Enter.
input()
# len() returns the number of characters (Unicode code points) in a string
# and is unicode-aware, so it will correctly count the code points in a unicode string,
# not just the number of bytes (like in C).
# Note that a combining sequence such as "e" + combining diaeresis counts as two.
# encode('utf-8') can be used to get the raw bytes of a string,
# which can be used to get the number of bytes in a string
# For each sample string, print its length as reported by len() next to the
# length of its UTF-8 encoding in bytes.
for text in (string1, string2, string3, string4, string5, string6):
    utf8_bytes = text.encode('utf-8')
    print(len(text), len(utf8_bytes))