Computer Systems Fundamentals

#include <assert.h>

// Convert an ASCII digit character to its numeric value
// by measuring how far it sits above the character '0'.
int ascii_to_bin_subtraction(char c) {
    const int ascii_zero = '0';
    int as_int = c;
    return as_int - ascii_zero;
}

// Convert an ASCII digit character to its numeric value
// by keeping only the low four bits of the character code.
int ascii_to_bin_bitwise(char c) {
    const int low_nibble_mask = 0x0F;
    return low_nibble_mask & c;
}

// Convert a small number to its ASCII digit character
// by offsetting it from the character '0'.
char bin_to_ascii_addition(int i) {
    const int ascii_zero = '0';
    return (char)(ascii_zero + i);
}

// Convert a small number to its ASCII digit character
// by switching on the two bits that every ASCII digit has set.
char bin_to_ascii_bitwise(int i) {
    const int digit_bits = 0x30; // in binary: 0011 0000
    return (char)(digit_bits | i);
}

// Check that the arithmetic and bitwise conversions agree with each other
// and with the expected result, in both directions.
int main(void) {
    const char digit_char = '5';
    const int digit_value = 5;

    // character -> number
    assert(digit_value == ascii_to_bin_subtraction(digit_char));
    assert(digit_value == ascii_to_bin_bitwise(digit_char));
    assert(ascii_to_bin_subtraction(digit_char) == ascii_to_bin_bitwise(digit_char));

    // number -> character
    assert(digit_char == bin_to_ascii_addition(digit_value));
    assert(digit_char == bin_to_ascii_bitwise(digit_value));
    assert(bin_to_ascii_addition(digit_value) == bin_to_ascii_bitwise(digit_value));

    return 0;
}
#include <assert.h>
#include <string.h>
#include <stdbool.h>
#include <ctype.h>

// Upper-case the ASCII lowercase letters of `s` in place
// by subtracting the distance between the two cases (0x20).
// Returns `s` so that calls can be nested.
char *to_upper_subtraction(char *s) {
    for (char *p = s; *p != '\0'; p++) {
        if (*p < 'a' || *p > 'z') {
            // not a lowercase letter - leave untouched
            continue;
        }
        *p -= 0x20; // 32 in decimal
    }
    return s;
}

// Upper-case the ASCII lowercase letters of `s` in place
// by clearing the single bit (0x20) that separates the two cases.
// Returns `s` so that calls can be nested.
char *to_upper_bitwise(char * s) {
    for (char *p = s; *p != '\0'; p++) {
        if (*p < 'a' || *p > 'z') {
            // not a lowercase letter - leave untouched
            continue;
        }
        *p &= ~0x20; // same as masking with 0b11011111
    }
    return s;
}

// Compare two strings for equality, ignoring the case of ASCII letters.
//
// Letters are compared after forcing the 0x20 "lowercase" bit on;
// every other character must match exactly.
// Returns true only if both strings have the same length
// and every position matches.
bool case_insensitive_compare_bitwise(char *s1, char *s2) {
    int i = 0;
    for (; s1[i] && s2[i]; i++) {
        // <ctype.h> functions require a value representable as unsigned char;
        // passing a negative plain char is undefined behaviour
        unsigned char c1 = (unsigned char)s1[i];
        unsigned char c2 = (unsigned char)s2[i];

        if (isalpha(c1) && isalpha(c2)) {
            // Alphabetical character
            // Compare ignoring case
            if ((c1 | 0x20) != (c2 | 0x20)) {
                return false;
            }
        } else if (c1 != c2) {
            // Non-Alphabetical character
            // Normal comparison
            return false;
        }
    }

    // The loop stops as soon as either string ends.
    // The strings are only equal if both ended at the same position,
    // otherwise one is merely a prefix of the other.
    return s1[i] == s2[i];
}

// Exercise both upper-casing functions and the case-insensitive comparison.
int main(void) {
    const char *expected_upper = "HELLO, WORLD!";

    // each function modifies its argument, so each gets its own copy
    char via_subtraction[] = "Hello, World!";
    char via_bitwise[] = "Hello, World!";
    assert(strcmp(expected_upper, to_upper_subtraction(via_subtraction)) == 0);
    assert(strcmp(expected_upper, to_upper_bitwise(via_bitwise)) == 0);

    // same text, different mix of upper and lower case
    char mixed_case_a[] = "HeLLo, WOrLD!";
    char mixed_case_b[] = "hEllo, WORld!";
    assert(case_insensitive_compare_bitwise(mixed_case_a, mixed_case_b));

    return 0;
}
#include <stdio.h>
#include <string.h>

// Evaluates to the string "Equal" if s1 and s2 have identical bytes,
// otherwise "Not Equal".
// The arguments and the whole expansion are parenthesized so that the macro
// behaves as a single expression wherever it is used.
#define cmp(s1, s2) (strcmp((s1), (s2)) ? "Not Equal" : "Equal")

// Show that strcmp and strlen work on bytes, not on what a human reads:
// strings that look alike compare as different, and strlen reports the
// number of bytes rather than the number of visible characters.
int main(void) {
    char *string1 = "Hello World";         // normal ASCII
    char *string2 = "Hellо Wоrld";         // These are not latin o's
    char *string3 = "Hellⲟ W𐓪rld";         // These are also not latin o's and different from the above non-latin o's
    char *string4 = "Ⓗⓔⓛⓛⓞ Ⓦⓞⓡⓛⓓ"; // letters in circles, sure that exists in UNICODE for some reason
    char *string5 = "Hëllo World";         // e with a diaeresis (one character)
    char *string6 = "Hëllo World";         // latin small letter e followed by a combining diaeresis (two characters)

    printf("string1 == string2: %s\n", cmp(string1, string2));
    printf("string1 == string3: %s\n", cmp(string1, string3));
    printf("string1 == string4: %s\n", cmp(string1, string4));
    printf("string1 == string5: %s\n", cmp(string1, string5));
    printf("string1 == string6: %s\n", cmp(string1, string6));
    printf("string2 == string3: %s\n", cmp(string2, string3));
    printf("string2 == string4: %s\n", cmp(string2, string4));
    printf("string2 == string5: %s\n", cmp(string2, string5));
    printf("string2 == string6: %s\n", cmp(string2, string6));
    printf("string3 == string4: %s\n", cmp(string3, string4));
    printf("string3 == string5: %s\n", cmp(string3, string5));
    printf("string3 == string6: %s\n", cmp(string3, string6));
    printf("string4 == string5: %s\n", cmp(string4, string5));
    printf("string4 == string6: %s\n", cmp(string4, string6));
    printf("string5 == string6: %s\n", cmp(string5, string6));

    // pause: consume one character of input before printing the lengths
    // (the character itself is deliberately ignored)
    (void)getchar();

    // strlen returns a size_t, which must be printed with %zu
    printf("string1: %zu\n", strlen(string1));
    printf("string2: %zu\n", strlen(string2));
    printf("string3: %zu\n", strlen(string3));
    printf("string4: %zu\n", strlen(string4));
    printf("string5: %zu\n", strlen(string5));
    printf("string6: %zu\n", strlen(string6));

    return 0;
}


Python has a built-in module (`unicodedata`) for dealing with Unicode strings.
It is updated regularly to match the latest Unicode standard.
import unicodedata

# Six strings that look (almost) like "Hello World" but differ in their code points
string1 = "Hello World"          # normal ASCII
string2 = "Hellо Wоrld"          # These are not latin o's
string3 = "Hellⲟ W𐓪rld"          # These are also not latin o's and different from the above non-latin o's
string4 = "Ⓗⓔⓛⓛⓞ Ⓦⓞⓡⓛⓓ"  # letters in circles, sure that exists in UNICODE for some reason
string5 = "Hëllo World"          # e with a diaeresis (one character)
string6 = "Hëllo World"          # latin small letter e followed by a combining diaeresis (two characters)

def tryEqualities(s1, s2):
    """Compare two strings five different ways.

    Returns a 5-tuple of booleans: plain `==` first, then equality after
    normalizing both strings to NFC, NFKC, NFD and NFKD (in that order).
    """
    # Normalization lets us compare UNICODE characters that are semantically
    # equivalent even when their code points are not identical.
    #   NFC  - Canonical Composition
    #   NFKC - Compatibility Composition
    #   NFD  - Canonical Decomposition
    #   NFKD - Compatibility Decomposition
    # Compatibility is a less strict notion of equality than Canonical.
    # Composition:   "e" + combining diaeresis  ->  "e with a diaeresis"
    # Decomposition: "e with a diaeresis"       ->  "e" + combining diaeresis
    def same_under(form):
        return unicodedata.normalize(form, s1) == unicodedata.normalize(form, s2)

    results = [s1 == s2]
    for form in ('NFC', 'NFKC', 'NFD', 'NFKD'):
        results.append(same_under(form))
    return tuple(results)

# Every string paired with the name it is printed under
_named_strings = [
    ("string1", string1),
    ("string2", string2),
    ("string3", string3),
    ("string4", string4),
    ("string5", string5),
    ("string6", string6),
]

# Compare each string against every string that comes after it
for _index, (_left_name, _left) in enumerate(_named_strings):
    for _right_name, _right in _named_strings[_index + 1:]:
        print(f"{_left_name} == {_right_name}:", tryEqualities(_left, _right))

# pause until the user presses enter
input()

# len() counts code points, not bytes
for _name, _value in _named_strings:
    print(len(_value))
#include <stdio.h>

// Print the same code point (U+1F600) three ways:
// as hex escapes inside a string, byte by byte, and as a \U escape.
int main(void) {
    printf("The unicode code point U+1F600 encodes in UTF-8\n");
    printf("as 4 bytes: 0xF0 0x9F 0x98 0x80\n");
    printf("We can output the 4 bytes like this: \xF0\x9F\x98\x80 (UTF-8)\n");

    // the same four bytes, written out one at a time
    const int utf8_bytes[] = {0xF0, 0x9F, 0x98, 0x80};
    printf("Or like this: ");
    for (size_t i = 0; i < sizeof utf8_bytes / sizeof utf8_bytes[0]; i++) {
        putchar(utf8_bytes[i]);
    }
    putchar('\n');

    // \U must always be followed by exactly 8 hex digits.
    // Code points below 0x10000 (ie the BMP) can instead be written as
    // \uXXXX (lowercase u) with only 4 hex digits.
    printf("Or like this: \U0001F600 (UTF-32)\n");

    return 0;
}
#include <stdio.h>
#include <string.h>
#include <assert.h>

// Count the number of UNICODE code points in a NUL-terminated UTF-8 string.
//
// Each head byte announces how many bytes its code point occupies;
// we hop from head byte to head byte, checking that the bytes in between
// really are tail bytes (10xxxxxx).
//
// An invalid string is reported on stderr and trips an assert.
// If asserts are compiled out (NDEBUG), the number of code points counted
// before the invalid byte is returned instead.
// Overlong encodings and surrogate code points are not detected.
unsigned long utf8_strlen(char *string) {
    unsigned long num_code_points = 0;
    for (char *code_point = string; *code_point;) {
        int length;
        if ((*code_point & 0x80) == 0x00) {
            // ASCII
            length = 1;
        } else if ((*code_point & 0xE0) == 0xC0) {
            // 2-byte head byte
            length = 2;
        } else if ((*code_point & 0xF0) == 0xE0) {
            // 3-byte head byte
            length = 3;
        } else if ((*code_point & 0xF8) == 0xF0) {
            // 4-byte head byte
            length = 4;
        } else if ((*code_point & 0xC0) == 0x80) {
            // INVALID STRING
            // tail byte - should not be here
            // as we should be moving from head byte to head byte
            fprintf(stderr, "Invalid UTF-8 string: \"%s\"\n", string);
            fprintf(stderr, "Found a tail byte when head byte was expected\n");
            assert(0);
            return num_code_points;
        } else {
            // INVALID STRING
            // this is not a valid UTF-8 byte
            fprintf(stderr, "Invalid UTF-8 string: \"%s\"\n", string);
            fprintf(stderr, "Head byte indicates invalid length\n");
            assert(0);
            return num_code_points;
        }

        // Every byte after the head byte must be a tail byte.
        // The NUL terminator is not a tail byte, so this check also stops us
        // from stepping past the end of a string that was cut off mid-sequence.
        for (int i = 1; i < length; i++) {
            if ((code_point[i] & 0xC0) != 0x80) {
                fprintf(stderr, "Invalid UTF-8 string: \"%s\"\n", string);
                fprintf(stderr, "Head byte is not followed by enough tail bytes\n");
                assert(0);
                return num_code_points;
            }
        }

        code_point += length;
        num_code_points++;
    }

    return num_code_points;
}

// For a set of look-alike strings, print the byte count (strlen)
// next to the code point count (utf8_strlen).
int main(void) {
    char *strings[] = {
        "Hello World",
        "Hellо Wоrld",
        "Hellⲟ W𐓪rld",
        "Ⓗⓔⓛⓛⓞ Ⓦⓞⓡⓛⓓ",
        "Hëllo World",
        "Hëllo World",
    };
    size_t num_strings = sizeof strings / sizeof strings[0];

    for (size_t i = 0; i < num_strings; i++) {
        // strlen returns a size_t (%zu); utf8_strlen returns an unsigned long (%lu)
        printf("\"%s\": strlen=%zu, utf8_strlen=%lu\n",
               strings[i], strlen(strings[i]), utf8_strlen(strings[i]));
    }

    return 0;
}
#include <stdio.h>
#include <stdint.h>

// Print a code point together with the bytes of its UTF-8 encoding,
// first as hex values and then as the raw encoded text.
// Code points of 0x200000 and above are not encoded (no bytes are printed).
void print_utf8_encoding(uint32_t code_point) {
    // at most 4 bytes of encoding, plus a zero byte that terminates the string
    uint8_t encoding[5] = {0};

    // how many bytes does this code point need?
    int length = 0;
    if (code_point < 0x80) {
        length = 1;
    } else if (code_point < 0x800) {
        length = 2;
    } else if (code_point < 0x10000) {
        length = 3;
    } else if (code_point < 0x200000) {
        length = 4;
    }

    if (length == 1) {
        // ASCII is stored unchanged
        encoding[0] = code_point;
    } else if (length > 1) {
        // marker bits of the head byte, indexed by encoding length
        static const uint8_t head_marker[] = {0, 0, 0xC0, 0xE0, 0xF0};

        // fill the tail bytes back to front, 6 bits of payload each
        uint32_t remaining = code_point;
        for (int i = length - 1; i > 0; i--) {
            encoding[i] = 0x80 | (remaining & 0x3f);
            remaining >>= 6;
        }
        // whatever bits are left over go in the head byte
        encoding[0] = head_marker[length] | remaining;
    }

    printf("U+%04x, UTF-32: 0x%08x, UTF-8: ", code_point, code_point);
    for (int i = 0; encoding[i] != 0; i++) {
        printf("0x%02x ", encoding[i]);
    }
    printf(" %s\n", encoding);
}

// Show the encoding of one code point for each UTF-8 length (1 to 4 bytes).
int main(void) {
    const uint32_t examples[] = {0x0042, 0x00A2, 0x10be, 0x1F600};

    for (size_t i = 0; i < sizeof examples / sizeof examples[0]; i++) {
        print_utf8_encoding(examples[i]);
    }

    return 0;
}

# The text as it appears when its bytes are read as latin-1
garbled_text = """\
ÒÏÓÓÉÑ, ÍÏÓË×Á, 119415
ÐÒ.÷ÅÒÎÁÄÓËÏÇÏ, 37,
Ë.1817-1,
ÖÌÅÔÎÅ×ÏÊ Ó×ÅÔÌÁÎÅ
"""

print("with latin-1 encoding:")
print(garbled_text)

# Encoding with latin-1 recovers the underlying bytes
raw_bytes = garbled_text.encode('latin-1')

print("raw bytes:")
for line in raw_bytes.splitlines():
    print(line)
print()

# Interpret those same bytes using koi8_r instead
decoded_text = raw_bytes.decode('koi8_r')

print("with koi8_r encoding:")
print(decoded_text)
#include <stdio.h>
#include <string.h>

// Declare and print a variable whose name contains an invisible character.
int main(void) {

    // The variable below doesn't look like it is valid C,
    // as a variable name must be a single word;
    // replacing the space with an underscore `_` would make it valid

    // EXCEPT that the space is not a space
    // it's the "HALFWIDTH HANGUL FILLER" character
    // the HALFWIDTH HANGUL FILLER is rather special
    // as it is one of very few characters that are invisible
    // ie don't have a visible glyph
    // yet isn't a whitespace character
    // (ie a character that has the WhiteSpace=yes UNICODE property)
    // This means that C allows it to be used in variable names

    int helloᅠworld = 20;
    printf("hello world = %d\n", helloᅠworld);

    // Please never actually use this character in variable names

    return 0;
}
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <stdbool.h>

// most modern text editors will show you the invisible characters in this file
// because this file demonstrates a very real security vulnerability
// In vscode you can disable this by adding the following to your settings.json
//      "editor.renderWhitespace": "none",
//      "editor.unicodeHighlight.ambiguousCharacters": false,
//      "editor.unicodeHighlight.invisibleCharacters": false,
//      "editor.unicodeHighlight.nonBasicASCII": false,
//      "editor.renderControlCharacters": false,
// But you should immediately re-enable it after you have finished reading this file


// Print both strings, then report whether they are byte-for-byte identical.
bool strings_are_equal(char *s1, char *s2) {
    printf("Comparing \"%s\" and \"%s\"\n", s1, s2);

    int difference = strcmp(s1, s2);
    if (difference != 0) {
        return false;
    }
    return true;
}

// Look up an access level chosen on the command line and greet admins.
// The admin check below is deliberately deceptive - see the comments on it.
int main(int argc, char *argv[]) {
    if (argc != 2) {
        printf("Usage: %s <access_level>\n", argv[0]);
        printf("Access levels:\n");
        printf("0: user, 1: admin\n");
        return 1;
    }

    char *access_levels[] = {"user", "admin"};
    int num_access_levels = sizeof access_levels / sizeof access_levels[0];

    // atoi returns 0 for non-numeric input, which selects "user"
    int selected_access_level = atoi(argv[1]);

    // reject anything that would index outside the array
    if (selected_access_level < 0 || selected_access_level >= num_access_levels) {
        printf("Unknown access level: %s\n", argv[1]);
        return 1;
    }

    char *current_access_level = access_levels[selected_access_level];

    printf("Current access level: %s\n", current_access_level);

    // with invisible characters not visible
    // this next line will likely look like:
    //      if (!strings_are_equal(current_access_level, "user")) // Check if admin
    // But that's not actually what it is
    // the real code is:
    //      if (!strings_are_equal(current_access_level, "user <invisible-characters>// Check if admin<invisible-characters>"))
    // The invisible characters here are the left-to-right mark (U+200E) and the left-to-right embedding (U+202A)
    // These characters tell the text editor to render the text in a forced direction
    // Instead of the normal direction of the text (left-to-right in the western world, right-to-left in the arabic world)
    // Forcing the text to be rendered in a different direction is a common way to make text appear to be something it is not
    // Like in this case where what looks like a comment is actually part of the string
    if (!strings_are_equal(current_access_level, "user‮ ⁦// Check if admin⁩ ⁦"))
    {
        printf("Hello admin.\n");
    }

    return 0;
}
#include <stdio.h>
#include <string.h>

// most modern text editors will show you the invisible characters in this file
// because this file demonstrates a very real security vulnerability
// In vscode you can disable this by adding the following to your settings.json
//      "editor.renderWhitespace": "none",
//      "editor.unicodeHighlight.ambiguousCharacters": false,
//      "editor.unicodeHighlight.invisibleCharacters": false,
//      "editor.unicodeHighlight.nonBasicASCII": false,
//      "editor.renderControlCharacters": false,
// But you should immediately re-enable it after you have finished reading this file

// Declare three variables whose names look the same but are spelled with
// different code points, then assign to and print them.
// NOTE(review): which variable each line refers to depends on the exact
// code point used for the `o` on that line, and the trailing `// 100` /
// `// 999` comments are the original author's - confirm both with a hex dump.
int main(void) {
    // These three variables look to be all the same
    // But they are not, they are all different variables
    // Each variable uses a different unicode character for the `o`
    // The first variable uses the latin small letter o (U+006F)
    // The second variable uses the armenian small letter o (U+0585)
    // The third variable uses the cyrillic small letter o (U+043E)
    // They all look the same, but the C compiler treats them as different variables

    int total;
    int tօtal;
    int tоtal = -666;

    total = 100;
    printf("total = %d\n", total); // 100

    tօtal = 999;
    printf("total = %d\n", total); // 999

    total = tօtal;
    printf("total = %d\n", tоtal); // 999

    printf("total = %d\n", tօtal); // 999

    return 0;
}