Computer Systems Fundamentals
#include <assert.h>
#include <string.h>
#include <stdbool.h>
#include <ctype.h>
/*
 * Upper-case every ASCII lowercase letter of `s` in place.
 *
 * Works because in ASCII each lowercase letter is exactly 0x20 (32) above
 * its uppercase counterpart. Bytes outside 'a'..'z' are left untouched.
 * Returns `s` so the call can be used directly as an expression.
 */
char *to_upper_subtraction(char *s) {
    for (char *p = s; *p != '\0'; p++) {
        if (*p < 'a' || *p > 'z') {
            continue; // not a lowercase ASCII letter
        }
        *p = *p - 0x20; // 0x20 == 32 == 'a' - 'A'
    }
    return s;
}
/*
 * Upper-case every ASCII lowercase letter of `s` in place by clearing bit 5.
 *
 * Bit 5 (0x20) is the only bit that differs between an ASCII lowercase
 * letter and its uppercase form, so masking it off converts the letter.
 * Bytes outside 'a'..'z' are left untouched. Returns `s`.
 */
char *to_upper_bitwise(char * s) {
    for (char *p = s; *p != '\0'; p++) {
        if (*p < 'a' || *p > 'z') {
            continue; // not a lowercase ASCII letter
        }
        *p &= ~0x20; // same mask as binary 11011111
    }
    return s;
}
/*
 * Return true when `s1` and `s2` are equal, ignoring the case of letters.
 *
 * Letters are compared after forcing bit 5 (0x20) on, which maps an ASCII
 * uppercase letter onto its lowercase form. Any position where at least one
 * byte is not a letter is compared exactly, so '[' (0x5B) and '{' (0x7B),
 * which also differ only in bit 5, are NOT treated as equal.
 *
 * Strings of different length are never equal: after the common prefix has
 * been checked, both strings must end at the same index.
 */
bool case_insensitive_compare_bitwise(char *s1, char *s2) {
    int i = 0;
    for (; s1[i] && s2[i]; i++) {
        // <ctype.h> functions require a value representable as unsigned char
        unsigned char c1 = (unsigned char)s1[i];
        unsigned char c2 = (unsigned char)s2[i];
        if (isalpha(c1) && isalpha(c2)) {
            // Alphabetical characters: compare ignoring case
            if ((c1 | 0x20) != (c2 | 0x20)) {
                return false;
            }
        } else if (c1 != c2) {
            // At least one non-alphabetical character: normal comparison
            return false;
        }
    }
    // The loop stops at the first NUL; equal only if BOTH strings ended here
    return s1[i] == s2[i];
}
/*
 * Self-test: upper-case two writable copies of "Hello, World!" with each
 * to_upper_* variant, then check that two differently-cased spellings of
 * the same text compare equal when case is ignored.
 */
int main(void) {
    // Arrays rather than pointers to literals: the helpers modify in place
    char subtraction_input[] = "Hello, World!";
    char bitwise_input[] = "Hello, World!";
    assert(strcmp("HELLO, WORLD!", to_upper_subtraction(subtraction_input)) == 0);
    assert(strcmp("HELLO, WORLD!", to_upper_bitwise(bitwise_input)) == 0);

    char mixed_case_a[] = "HeLLo, WOrLD!";
    char mixed_case_b[] = "hEllo, WORld!";
    assert(case_insensitive_compare_bitwise(mixed_case_a, mixed_case_b));

    return 0;
}
#include <stdio.h>
#include <string.h>
// Expands to the string "Equal" when strcmp() finds the two strings
// byte-for-byte identical, and to "Not Equal" otherwise.
// The arguments and the whole expansion are parenthesised so that the
// conditional expression cannot regroup with operators written around
// the macro call (eg `cmp(a, b)[0]` or `!cmp(a, b)`).
#define cmp(s1, s2) (strcmp((s1), (s2)) ? "Not Equal" : "Equal")
/*
 * Demonstrates that strcmp() and strlen() work on raw bytes: strings that
 * look alike on screen but are built from different Unicode characters
 * compare as "Not Equal", and strlen() reports bytes rather than characters.
 *
 * Prints every pairwise comparison, waits for one character on stdin,
 * then prints the byte length of each string.
 */
int main(void) {
    // const: these point at string literals, which must never be written to
    const char *string1 = "Hello World"; // normal ASCII
    const char *string2 = "Hellо Wоrld"; // These are not latin o's
    const char *string3 = "Hellⲟ W𐓪rld"; // These are also not latin o's and different from the above non-latin o's
    const char *string4 = "Ⓗⓔⓛⓛⓞ Ⓦⓞⓡⓛⓓ"; // letters in circles, sure that exists in UNICODE for some reason
    const char *string5 = "Hëllo World"; // e with a diaeresis (one character)
    const char *string6 = "Hëllo World"; // latin small letter e followed by a combining diaeresis (two characters)

    printf("string1 == string2: %s\n", cmp(string1, string2));
    printf("string1 == string3: %s\n", cmp(string1, string3));
    printf("string1 == string4: %s\n", cmp(string1, string4));
    printf("string1 == string5: %s\n", cmp(string1, string5));
    printf("string1 == string6: %s\n", cmp(string1, string6));
    printf("string2 == string3: %s\n", cmp(string2, string3));
    printf("string2 == string4: %s\n", cmp(string2, string4));
    printf("string2 == string5: %s\n", cmp(string2, string5));
    printf("string2 == string6: %s\n", cmp(string2, string6));
    printf("string3 == string4: %s\n", cmp(string3, string4));
    printf("string3 == string5: %s\n", cmp(string3, string5));
    printf("string3 == string6: %s\n", cmp(string3, string6));
    printf("string4 == string5: %s\n", cmp(string4, string5));
    printf("string4 == string6: %s\n", cmp(string4, string6));
    printf("string5 == string6: %s\n", cmp(string5, string6));

    // Pause: consume one character from stdin before showing the lengths.
    // The value (or EOF) is deliberately discarded.
    (void)getchar();

    // strlen() returns size_t, whose matching conversion is %zu (not %lu)
    printf("string1: %zu\n", strlen(string1));
    printf("string2: %zu\n", strlen(string2));
    printf("string3: %zu\n", strlen(string3));
    printf("string4: %zu\n", strlen(string4));
    printf("string5: %zu\n", strlen(string5));
    printf("string6: %zu\n", strlen(string6));

    return 0;
}
Python has a built-in module for dealing with Unicode strings
Updated regularly to match the latest Unicode standard
import unicodedata
string1 = "Hello World"; # normal ASCII
string2 = "Hellо Wоrld"; # These are not latin o's
string3 = "Hellⲟ W𐓪rld"; # These are also not latin o's and different from the above non-latin o's
string4 = "Ⓗⓔⓛⓛⓞ Ⓦⓞⓡⓛⓓ"; # letters in circles, sure that exists in UNICODE for some reason
string5 = "Hëllo World"; # e with a diaeresis (one character)
string6 = "Hëllo World"; # latin small letter e followed by a combining diaeresis (two characters)
def tryEqualities(s1, s2):
    """Compare two strings directly and under each Unicode normalization form.

    Returns a 5-tuple of booleans, in this order:
    (plain ==, equal after NFC, equal after NFKC, equal after NFD, equal after NFKD).
    """
    # normalization rules are used to compare UNICODE characters that are semantically equivalent even if they are not identical
    # NFC is Canonical Composition
    # NFKC is Compatibility Composition
    # NFD is Canonical Decomposition
    # NFKD is Compatibility Decomposition
    # Compatibility is a less strict equality than Canonical
    # Composition means that eg "letter e followed by a combining diaeresis" is converted to "e with a diaeresis"
    # Decomposition means that eg "e with a diaeresis" is converted to "letter e followed by a combining diaeresis"
    return (
        s1 == s2,
        unicodedata.normalize('NFC', s1) == unicodedata.normalize('NFC', s2),
        unicodedata.normalize('NFKC', s1) == unicodedata.normalize('NFKC', s2),
        unicodedata.normalize('NFD', s1) == unicodedata.normalize('NFD', s2),
        unicodedata.normalize('NFKD', s1) == unicodedata.normalize('NFKD', s2),
    )
# The six sample strings in declaration order, so that position k holds string{k+1}.
samples = [string1, string2, string3, string4, string5, string6]

# Compare every unordered pair exactly once (first index < second index),
# in the order 1-2, 1-3, ..., 1-6, 2-3, ..., 5-6.
for first in range(len(samples)):
    for second in range(first + 1, len(samples)):
        print(f"string{first + 1} == string{second + 1}:", tryEqualities(samples[first], samples[second]))

# Pause until the user presses Enter before showing the lengths.
input()

# len() counts code points, not bytes.
for sample in samples:
    print(len(sample))
#include <stdio.h>
/*
 * Print the emoji U+1F600 three ways: as hex escapes inside a string
 * literal, as four individually written bytes, and as a \U universal
 * character name.
 */
int main(void) {
    // The four UTF-8 bytes that encode U+1F600
    const unsigned char utf8_bytes[] = {0xF0, 0x9F, 0x98, 0x80};

    fputs("The unicode code point U+1F600 encodes in UTF-8\n", stdout);
    fputs("as 4 bytes: 0xF0 0x9F 0x98 0x80\n", stdout);
    fputs("We can output the 4 bytes like this: \xF0\x9F\x98\x80 (UTF-8)\n", stdout);

    fputs("Or like this: ", stdout);
    for (size_t i = 0; i < sizeof utf8_bytes; i++) {
        putchar(utf8_bytes[i]);
    }
    putchar('\n');

    fputs("Or like this: \U0001F600 (UTF-32)\n", stdout);
    // UNICODE code point less than 0x10000 (ie the BMP) can be encoded with
    // \uXXXX (lowercase u) with only 4 hex digits
    // \U must always be followed by 8 hex digits
    return 0;
}
#include <stdio.h>
#include <string.h>
#include <assert.h>
/*
 * Classify `head` as the first byte of a UTF-8 sequence.
 * Returns the sequence length in bytes (1-4), 0 when `head` is a tail
 * (continuation) byte, or -1 when it announces an unsupported length.
 */
static int utf8_sequence_length(unsigned char head) {
    if ((head & 0x80) == 0x00) {
        return 1; // 0xxxxxxx: ASCII
    }
    if ((head & 0xC0) == 0x80) {
        return 0; // 10xxxxxx: tail byte
    }
    if ((head & 0xE0) == 0xC0) {
        return 2; // 110xxxxx
    }
    if ((head & 0xF0) == 0xE0) {
        return 3; // 1110xxxx
    }
    if ((head & 0xF8) == 0xF0) {
        return 4; // 11110xxx
    }
    return -1;    // 11111xxx: not a valid UTF-8 byte
}

/*
 * Count the code points (not bytes) in the NUL-terminated UTF-8 `string`.
 *
 * The string is walked from head byte to head byte. Every tail byte of a
 * multi-byte sequence is checked before it is skipped, so a sequence cut
 * short by the terminating NUL is reported instead of being read past.
 * Overlong encodings and out-of-range code points are not detected.
 *
 * On malformed input a diagnostic is written to stderr and assert(0) fires.
 * If assertions are compiled out (NDEBUG), the number of code points
 * counted before the malformed byte is returned.
 */
unsigned long utf8_strlen(char *string) {
    unsigned long num_code_points = 0;
    // unsigned char so that bytes >= 0x80 are not seen as negative values
    const unsigned char *code_point = (const unsigned char *)string;

    while (*code_point) {
        int length = utf8_sequence_length(*code_point);

        if (length == 0) {
            // INVALID STRING
            // tail byte - should not be here
            // as we should be moving from head byte to head byte
            fprintf(stderr, "Invalid UTF-8 string: \"%s\"\n", string);
            fprintf(stderr, "Found a tail byte when head byte was expected\n");
            assert(0);
            return num_code_points;
        }
        if (length < 0) {
            // INVALID STRING
            // this is not a valid UTF-8 byte
            fprintf(stderr, "Invalid UTF-8 string: \"%s\"\n", string);
            fprintf(stderr, "Head byte indicates invalid length\n");
            assert(0);
            return num_code_points;
        }

        // Check each expected tail byte before stepping over it.
        // A NUL is not a tail byte, so this also stops at the end of string.
        for (int i = 1; i < length; i++) {
            if ((code_point[i] & 0xC0) != 0x80) {
                fprintf(stderr, "Invalid UTF-8 string: \"%s\"\n", string);
                fprintf(stderr, "Multi-byte sequence is missing a tail byte\n");
                assert(0);
                return num_code_points;
            }
        }

        code_point += length;
        num_code_points++;
    }
    return num_code_points;
}
/*
 * For each sample string, print its byte length (strlen) next to its
 * code-point count (utf8_strlen).
 */
int main(void) {
    char *strings[] = {
        "Hello World",
        "Hellо Wоrld",
        "Hellⲟ W𐓪rld",
        "Ⓗⓔⓛⓛⓞ Ⓦⓞⓡⓛⓓ",
        "Hëllo World",
        "Hëllo World",
    };
    size_t num_strings = sizeof strings / sizeof strings[0];

    for (size_t i = 0; i < num_strings; i++) {
        // strlen() returns size_t (%zu); utf8_strlen() returns unsigned long (%lu)
        printf("\"%s\": strlen=%zu, utf8_strlen=%lu\n",
               strings[i], strlen(strings[i]), utf8_strlen(strings[i]));
    }
    return 0;
}
#include <stdio.h>
#include <stdint.h>
/*
 * Print `code_point` as U+XXXX, as its UTF-32 value, as the individual
 * bytes of its UTF-8 encoding, and finally as the encoded character itself.
 *
 * Only Unicode scalar values are encoded: 0..0x10FFFF excluding the
 * surrogate range 0xD800..0xDFFF. For anything else the UTF-8 column says
 * that the value is not valid instead of showing an encoding.
 */
void print_utf8_encoding(uint32_t code_point) {
    // 4 bytes at most, plus a spare zero byte so the buffer is always a
    // NUL-terminated string for the final "%s"
    uint8_t encoding[5] = {0};
    int length = 0; // number of bytes produced; 0 means "not encodable"

    if (code_point >= 0xD800 && code_point <= 0xDFFF) {
        // surrogates are reserved for UTF-16 and have no UTF-8 encoding
        length = 0;
    } else if (code_point < 0x80) {
        // 0xxxxxxx
        encoding[0] = code_point;
        length = 1;
    } else if (code_point < 0x800) {
        // 110xxxxx 10xxxxxx
        encoding[0] = 0xC0 | (code_point >> 6);
        encoding[1] = 0x80 | (code_point & 0x3f);
        length = 2;
    } else if (code_point < 0x10000) {
        // 1110xxxx 10xxxxxx 10xxxxxx
        encoding[0] = 0xE0 | (code_point >> 12);
        encoding[1] = 0x80 | ((code_point >> 6) & 0x3f);
        encoding[2] = 0x80 | (code_point & 0x3f);
        length = 3;
    } else if (code_point <= 0x10FFFF) {
        // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        encoding[0] = 0xF0 | (code_point >> 18);
        encoding[1] = 0x80 | ((code_point >> 12) & 0x3f);
        encoding[2] = 0x80 | ((code_point >> 6) & 0x3f);
        encoding[3] = 0x80 | (code_point & 0x3f);
        length = 4;
    }

    printf("U+%04x, UTF-32: 0x%08x, UTF-8: ", (unsigned)code_point, (unsigned)code_point);
    if (length == 0) {
        printf("(not a valid Unicode scalar value)\n");
        return;
    }
    // Iterate by length, not up to the first zero byte, so U+0000 shows 0x00
    for (int i = 0; i < length; i++) {
        printf("0x%02x ", (unsigned)encoding[i]);
    }
    printf(" %s\n", (const char *)encoding);
}
/*
 * Print the UTF-8 encoding of one sample code point for each encoded
 * length: 1, 2, 3 and 4 bytes.
 */
int main(void) {
    const uint32_t samples[] = {0x0042, 0x00A2, 0x10be, 0x1F600};
    const int num_samples = (int)(sizeof samples / sizeof samples[0]);

    for (int i = 0; i < num_samples; i++) {
        print_utf8_encoding(samples[i]);
    }
    return 0;
}
# Text that was written in the KOI8-R encoding but has been read as if it
# were Latin-1, so every Cyrillic letter shows up as an accented Latin one.
a = """\
ÒÏÓÓÉÑ, ÍÏÓË×Á, 119415
ÐÒ.÷ÅÒÎÁÄÓËÏÇÏ, 37,
Ë.1817-1,
ÖÌÅÔÎÅ×ÏÊ Ó×ÅÔÌÁÎÅ
"""

print("with latin-1 encoding:")
print(a)

# Encoding back to Latin-1 recovers the original bytes unchanged, because
# Latin-1 maps each of these characters to exactly one byte.
b = a.encode('latin-1')
print("raw bytes:")
for byte in b.splitlines():
    print(byte)
print()

# Decoding those same bytes with the intended encoding gives the real text.
c = b.decode('koi8_r')
print("with koi8_r encoding:")
print(c)
#include <stdio.h>
#include <string.h>
/*
 * Declares and prints a variable whose name contains an invisible,
 * non-whitespace Unicode character, so that a single identifier reads
 * as if it were two separate words.
 */
int main(void) {
// At first glance the variable below doesn't look like it is valid C,
// as a variable name must be a single word.
// Replacing the space with an underscore `_` would make it valid,
// EXCEPT that the space is not a space:
// it's the "HALFWIDTH HANGUL FILLER" character.
// The HALFWIDTH HANGUL FILLER is rather special,
// as it is one of very few characters that are invisible
// (ie don't have a visible glyph)
// yet isn't a whitespace character (it does NOT have the White_Space=yes UNICODE property).
// This means that C allows it to be used in variable names.
int helloᅠworld = 20;
printf("hello world = %d\n", helloᅠworld);
// Please never actually use this character in variable names
return 0;
}
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <stdbool.h>
// most modern text editors will show you the invisible characters in this file
// because this file demonstrates a very real security vulnerability
// In vscode you can disable this by adding the following to your settings.json
// "editor.renderWhitespace": "none",
// "editor.unicodeHighlight.ambiguousCharacters": false,
// "editor.unicodeHighlight.invisibleCharacters": false,
// "editor.unicodeHighlight.nonBasicASCII": false,
// "editor.renderControlCharacters": false,
// But you should immediately re-enable it after you have finished reading this file
/*
 * Log both strings to stdout, then report whether they are byte-for-byte
 * identical. Printing the operands makes any hidden characters in either
 * string part of the program's output.
 */
bool strings_are_equal(char *s1, char *s2) {
    printf("Comparing \"%s\" and \"%s\"\n", s1, s2);

    // strcmp() yields 0 only when the two strings match exactly
    const int difference = strcmp(s1, s2);
    if (difference != 0) {
        return false;
    }
    return true;
}
int main(int argc, char *argv[]) {
if (argc != 2) {
printf("Usage: %s <access_level>\n", argv[0]);
printf("Access levels:\n");
printf("0: user, 1: admin\n");
return 1;
}
char *access_levels[] = {"user", "admin"};
int selected_access_level = atoi(argv[1]);
char *current_access_level = access_levels[selected_access_level];
printf("Current access level: %s\n", current_access_level);
// with invisible characters not visible
// this next line will likely look like:
// if (!strings_are_equal(current_access_level, "user")) // Check if admin
// But that's not actually what it is
// the real code is:
// if (!strings_are_equal(current_access_level, "user <invisible-characters>// Check if admin<invisible-characters>"))
// The invisible characters here are the left-to-right mark (U+200E) and the left-to-right embedding (U+202A)
// These characters tell the text editor to render the text in a forced direction
// Instead of the normal direction of the text (left-to-right in the western world, right-to-left in the arabic world)
// Forcing the text to be rendered in a different direction is a common way to make text appear to be something it is not
// Like in this case where what looks like a comment is actually part of the string
if (!strings_are_equal(current_access_level, "user // Check if admin "))
{
printf("Hello admin.\n");
}
return 0;
}
#include <stdio.h>
#include <string.h>
// most modern text editors will show you the invisible characters in this file
// because this file demonstrates a very real security vulnerability
// In vscode you can disable this by adding the following to your settings.json
// "editor.renderWhitespace": "none",
// "editor.unicodeHighlight.ambiguousCharacters": false,
// "editor.unicodeHighlight.invisibleCharacters": false,
// "editor.unicodeHighlight.nonBasicASCII": false,
// "editor.renderControlCharacters": false,
// But you should immediately re-enable it after you have finished reading this file
/*
 * Declares three variables whose names look identical on screen but are
 * spelled with different Unicode letters, then assigns to and prints them.
 * NOTE(review): the trailing `// 100` / `// 999` comments appear to show
 * what a reader who assumes a single variable would expect, not necessarily
 * what each printf really prints - run the program to see the difference.
 */
int main(void) {
// These three variables look to be all the same
// But they are not, they are all different variables
// Each variable uses a different unicode character for the `o`
// The first variable uses the latin small letter o (U+006F)
// The second variable uses the armenian small letter o (U+0585)
// The third variable uses the cyrillic small letter o (U+043E)
// They all look the same, but the C compiler treats them as different variables
int total;
int tօtal;
int tоtal = -666;
total = 100;
printf("total = %d\n", total); // 100
tօtal = 999;
printf("total = %d\n", total); // 999
total = tօtal;
printf("total = %d\n", tоtal); // 999
printf("total = %d\n", tօtal); // 999
return 0;
}