#! /usr/bin/env python3
"""Demonstrate Unicode normalization and code-point vs. byte lengths.

Run as a script, this prints, for every pair of sample strings, whether the
two strings compare equal raw and under each Unicode normalization form,
waits for the user to press Enter, then prints each sample's length in code
points next to its length in UTF-8 bytes.

Importing the module has no side effects: the sample strings and
``tryEqualities`` are available without any printing or blocking on stdin.
"""

# Python has a built-in module for dealing with Unicode strings.
# It is updated regularly to match the latest Unicode standard.
import itertools
import unicodedata

string1 = "Hello World"  # normal ASCII
string2 = "Hellо Wоrld"  # These are not latin o's (cyrillic)
string3 = "Hellⲟ Wօrld"  # These are also not latin o's (coptic and armenian)
string4 = "Ⓗⓔⓛⓛⓞ Ⓦⓞⓡⓛⓓ"  # letters in circles
# string5 and string6 render identically, so they are spelled with explicit
# escapes: an editor or tool that NFC-normalizes this file would otherwise
# silently turn string6 into string5 and defeat the demonstration.
string5 = "H\u00ebllo World"  # e with a diaeresis (one character)
string6 = "He\u0308llo World"  # latin small letter e followed by a combining diaeresis (two characters)

# Normalization rules are used to compare UNICODE characters that are
# semantically equivalent even if they are not identical.
#   NFC  is Canonical Composition
#   NFKC is Compatibility Composition
#   NFD  is Canonical Decomposition
#   NFKD is Compatibility Decomposition
# Compatibility is a less strict equality than Canonical.
# Composition means that eg "letter e followed by a combining diaeresis"
# is converted to "e with a diaeresis".
# Decomposition means that eg "e with a diaeresis" is converted to
# "letter e followed by a combining diaeresis".
NORMALIZATION_FORMS = ("NFC", "NFKC", "NFD", "NFKD")

# (label, value) pairs in declaration order; the labels are what gets printed.
SAMPLES = (
    ("string1", string1),
    ("string2", string2),
    ("string3", string3),
    ("string4", string4),
    ("string5", string5),
    ("string6", string6),
)


def tryEqualities(s1: str, s2: str) -> tuple:
    """Compare two strings raw and under every Unicode normalization form.

    Args:
        s1: First string.
        s2: Second string.

    Returns:
        A 5-tuple of booleans, in this order: plain ``s1 == s2``, then
        equality after normalizing both strings with NFC, NFKC, NFD and
        NFKD respectively.
    """
    normalized_matches = (
        unicodedata.normalize(form, s1) == unicodedata.normalize(form, s2)
        for form in NORMALIZATION_FORMS
    )
    return (s1 == s2, *normalized_matches)


def main() -> None:
    """Print the pairwise comparison table, pause, then print the lengths."""
    # combinations() yields each unordered pair once, in declaration order:
    # (1,2), (1,3), ..., (1,6), (2,3), ..., (5,6).
    for (label_a, text_a), (label_b, text_b) in itertools.combinations(SAMPLES, 2):
        print(f"{label_a} == {label_b}:", tryEqualities(text_a, text_b))

    # Pause so the comparison table can be read before the lengths scroll by.
    input()

    # len() counts Unicode code points, not bytes (unlike strlen in C) and
    # not user-perceived characters: string6 is one longer than string5
    # because the combining diaeresis is a code point of its own.
    # encode('utf-8') gives the raw bytes of a string, whose len() is the
    # number of bytes the string occupies in UTF-8.
    for _label, text in SAMPLES:
        print(len(text), len(text.encode('utf-8')))


if __name__ == "__main__":
    main()