Computer Systems Fundamentals

            ASCII_to_DEC.c
        

        

    
#include <assert.h>

/**
 * @brief Map an ASCII digit character onto its numeric value
 *
 * The digit characters '0'..'9' have consecutive codes, so the distance of a
 * digit character from '0' is the digit's value.
 * No range check is performed: any other character simply yields its
 * (possibly negative) offset from '0'.
 *
 * @param c - character to convert
 * @return int - offset of c from '0' (0..9 when c is a digit character)
 */
int ascii_to_dec_subtraction(char c) {
    const int zero_code = '0';
    return (int)c - zero_code;
}

/**
 * @brief Map a numeric digit onto its ASCII digit character
 *
 * Offsets the code of '0' by the digit's value.
 * No range check is performed: values outside 0..9 produce whatever
 * character lies that far from '0'.
 *
 * @param i - decimal digit to convert
 * @return char - the character i positions after '0'
 */
char dec_to_ascii_addition(int i) {
    const int zero_code = '0';
    return (char)(zero_code + i);
}

/**
 * @brief Map an ASCII digit character onto its numeric value
 *
 * The codes of '0'..'9' are 0x30..0x39, so the low four bits of a digit
 * character already hold the digit's value; masking with 0x0F keeps just
 * those bits.
 * For a non-digit character the result is merely its low nibble.
 *
 * @param c - character to convert
 * @return int - low four bits of c (0..9 when c is a digit character)
 */
int ascii_to_dec_bitwise(char c) {
    const int low_nibble_mask = 0x0F;
    return (int)c & low_nibble_mask;
}

/**
 * @brief Map a numeric digit onto its ASCII digit character
 *
 * The codes of '0'..'9' are 0x30..0x39, so OR-ing a value 0..9 with 0x30
 * (setting the 5th and 6th bits) yields the matching digit character.
 * No range check is performed on the input.
 *
 * @param i - decimal digit to convert
 * @return char - i with the 0x30 bits set
 */
char dec_to_ascii_bitwise(int i) {
    const int digit_prefix = 0x30;
    return (char)(digit_prefix | i);
}

// Sanity check: for the digit 5 the arithmetic and the bitwise conversions
// agree with the expected value and with each other, in both directions.
int main(void) {
    const char digit_char = '5';
    const int digit_value = 5;

    // character -> value
    int value_by_subtraction = ascii_to_dec_subtraction(digit_char);
    int value_by_mask = ascii_to_dec_bitwise(digit_char);
    assert(value_by_subtraction == digit_value);
    assert(value_by_mask == digit_value);
    assert(value_by_subtraction == value_by_mask);

    // value -> character
    char char_by_addition = dec_to_ascii_addition(digit_value);
    char char_by_or = dec_to_ascii_bitwise(digit_value);
    assert(char_by_addition == digit_char);
    assert(char_by_or == digit_char);
    assert(char_by_addition == char_by_or);
}

        
            ASCII_case_insensitive.c
        

        

    
#include <assert.h>
#include <string.h>
#include <stdbool.h>
#include <ctype.h>

/**
 * @brief Uppercase an ASCII string in place using arithmetic
 *
 * In ASCII each lowercase letter sits exactly 32 codes above its uppercase
 * counterpart, so subtracting 32 from a character in 'a'..'z' uppercases it.
 * All other characters are left untouched.
 *
 * @param s - NUL-terminated string to convert (modified in place)
 * @return char* - the same pointer that was passed in
 */
char *to_upper_subtraction(char *s) {
    char *cursor = s;
    while (*cursor != '\0') {
        if (!(*cursor < 'a' || *cursor > 'z')) {
            *cursor = (char)(*cursor - 32);
        }
        cursor++;
    }
    return s;
}

/**
 * @brief Uppercase an ASCII string in place using a bit mask
 *
 * An ASCII lowercase letter differs from its uppercase counterpart only in
 * the 0x20 bit, so clearing that bit (AND with 0xDF) on a character in
 * 'a'..'z' uppercases it. All other characters are left untouched.
 *
 * @param s - NUL-terminated string to convert (modified in place)
 * @return char* - the same pointer that was passed in
 */
char *to_upper_bitwise(char *s) {
    char *cursor = s;
    while (*cursor != '\0') {
        if (!(*cursor < 'a' || *cursor > 'z')) {
            *cursor = (char)(*cursor & ~0x20);
        }
        cursor++;
    }
    return s;
}

/**
 * @brief Compare two strings for equality, ignoring ASCII letter case
 *
 * Similar in spirit to POSIX strcasecmp() (declared in <strings.h>), but
 * returns a simple equal / not-equal answer instead of an ordering.
 *
 * Two strings are equal only if they have the same length and every pair of
 * characters matches; a string that is merely a prefix of the other is NOT
 * equal to it.
 *
 * @param s1 - first NUL-terminated string
 * @param s2 - second NUL-terminated string
 * @return true - if the strings are equal, ignoring case
 * @return false - if the strings are not equal
 */
bool case_insensitive_compare_bitwise(char *s1, char *s2) {
    size_t i = 0;
    for (; s1[i] && s2[i]; i++) {
        // <ctype.h> functions require a value representable as unsigned char;
        // passing a negative plain char is undefined behaviour.
        unsigned char c1 = (unsigned char)s1[i];
        unsigned char c2 = (unsigned char)s2[i];

        if (isalpha(c1) && isalpha(c2)) {
            // Alphabetical characters:
            // force both to lowercase by setting the 0x20 bit, then compare.
            if ((c1 | 0x20) != (c2 | 0x20)) {
                return false;
            }
        } else if (c1 != c2) {
            // At least one non-alphabetical character: exact comparison.
            // (The case bit must not be touched here, otherwise e.g. '[' and
            // '{' would compare equal.)
            return false;
        }
    }
    // The loop stops at the first terminator. The strings are equal only if
    // BOTH ended here; otherwise one is a proper prefix of the other.
    return s1[i] == s2[i];
}

// Sanity check: both uppercasing routines turn "Hello, World!" into
// "HELLO, WORLD!", and the comparison treats differently-cased spellings
// of the same text as equal.
int main(void) {
    const char *expected_upper = "HELLO, WORLD!";

    // Separate writable copies, because the converters modify their argument.
    char for_subtraction[] = "Hello, World!";
    char for_bitwise[] = "Hello, World!";
    assert(strcmp(expected_upper, to_upper_subtraction(for_subtraction)) == 0);
    assert(strcmp(expected_upper, to_upper_bitwise(for_bitwise)) == 0);

    char mixed_case_a[] = "HeLLo, WOrLD!";
    char mixed_case_b[] = "hEllo, WORld!";
    assert(case_insensitive_compare_bitwise(mixed_case_a, mixed_case_b));
}

        
            unicode_strings.c
        

        

    
#include <stdio.h>
#include <string.h>

// Human-readable result of a byte-for-byte strcmp() of two C strings.
// The arguments and the whole expansion are parenthesized so the macro
// behaves like a single value when used inside a larger expression.
#define cmp(s1, s2) (strcmp((s1), (s2)) ? "Not Equal" : "Equal")

// Shows that strings which look alike on screen can be built from different
// Unicode characters, so that strcmp() and strlen() - which only see bytes -
// report them as different strings with different lengths.
int main(void) {
    // strings[0] is "string1" in the output, strings[1] is "string2", etc.
    const char *strings[] = {
        "Hello World",         // string1: normal ASCII
        "Hellо Wоrld",         // string2: These are not latin o's (cyrillic)
        "Hellⲟ Wօrld",         // string3: These are also not latin o's (coptic and armenian)
        "Ⓗⓔⓛⓛⓞ Ⓦⓞⓡⓛⓓ", // string4: letters in circles
        "Hëllo World",         // string5: e with a diaeresis (one character)
        "Hëllo World",         // string6: latin small letter e followed by a combining diaeresis (two characters)
    };
    const size_t num_strings = sizeof strings / sizeof strings[0];

    // The command `unicode` can be used to see the unicode code points of a string
    // eg running `unicode -sm0 <string> --brief` on string3 will display:
    /*
        H U+0048 LATIN CAPITAL LETTER H
        e U+0065 LATIN SMALL LETTER E
        l U+006C LATIN SMALL LETTER L
        l U+006C LATIN SMALL LETTER L
        ⲟ U+2C9F COPTIC SMALL LETTER O
        U+0020 SPACE
        W U+0057 LATIN CAPITAL LETTER W
        օ U+0585 ARMENIAN SMALL LETTER OH
        r U+0072 LATIN SMALL LETTER R
        l U+006C LATIN SMALL LETTER L
        d U+0064 LATIN SMALL LETTER D
    */

    // Even though the strings look the same, they are not the same:
    // all of the strings contain different unicode characters,
    // so strcmp reports every pair as different (returns non-zero)

    // Compare every unordered pair once: (1,2), (1,3), ... (5,6)
    for (size_t i = 0; i < num_strings; i++) {
        for (size_t j = i + 1; j < num_strings; j++) {
            printf("string%zu == string%zu: %s\n", i + 1, j + 1, cmp(strings[i], strings[j]));
        }
    }

    // Pause: wait for one character of input before printing the lengths
    (void)getchar();

    // the strlen function does not count the number of characters in a string
    // it counts the number of bytes in a string
    // for ASCII strings, the number of bytes is the same as the number of characters
    // but for UTF-8 strings, the number of bytes is not the same as the number of characters

    // strlen returns size_t, which must be printed with %zu
    for (size_t i = 0; i < num_strings; i++) {
        printf("string%zu: %zu\n", i + 1, strlen(strings[i]));
    }
}

        
            unicode_strings.py
        

        



Python has a built-in module, `unicodedata`, for dealing with Unicode strings.

It is updated regularly to match the latest Unicode standard.

    
import unicodedata

# Six strings that look (almost) alike when rendered but are built from
# different Unicode code points. They are compared pairwise below.
string1 = "Hello World"         # normal ASCII
string2 = "Hellо Wоrld"         # These are not latin o's (cyrillic small letter o, U+043E)
string3 = "Hellⲟ Wօrld"         # These are also not latin o's (coptic small letter o U+2C9F and armenian small letter oh U+0585)
string4 = "Ⓗⓔⓛⓛⓞ Ⓦⓞⓡⓛⓓ" # letters in circles
string5 = "Hëllo World"         # e with a diaeresis (one precomposed character)
string6 = "Hëllo World"         # latin small letter e followed by a combining diaeresis (two characters)

def tryEqualities(s1, s2):
    """Compare two strings raw and under each Unicode normalization form.

    Returns a 5-tuple of booleans, in this order:
    (raw ==, equal after NFC, equal after NFKC, equal after NFD, equal after NFKD).
    """
    # Normalization lets us compare Unicode text that is semantically
    # equivalent even when the code points are not identical:
    #   NFC  - Canonical Composition
    #   NFKC - Compatibility Composition
    #   NFD  - Canonical Decomposition
    #   NFKD - Compatibility Decomposition
    # Compatibility is a looser notion of equality than Canonical.
    # Composition turns eg "letter e followed by a combining diaeresis"
    # into "e with a diaeresis"; Decomposition does the reverse.
    outcomes = [s1 == s2]
    for form in ('NFC', 'NFKC', 'NFD', 'NFKD'):
        left = unicodedata.normalize(form, s1)
        right = unicodedata.normalize(form, s2)
        outcomes.append(left == right)
    return tuple(outcomes)

# Compare every unordered pair of the six strings once:
# (1,2), (1,3), ... (5,6)
all_strings = [string1, string2, string3, string4, string5, string6]
for first in range(len(all_strings)):
    for second in range(first + 1, len(all_strings)):
        print(
            f"string{first + 1} == string{second + 1}:",
            tryEqualities(all_strings[first], all_strings[second]),
        )

# Pause until the user presses Enter
input()

# len() counts the code points in a string - it is unicode-aware,
# so it does not just count bytes (as strlen does in C)
# encode('utf-8') gives the raw bytes of a string,
# whose length is the number of bytes the string occupies in UTF-8

for text in all_strings:
    print(len(text), len(text.encode('utf-8')))

        
            hello_unicode.c
        

        

    
#include <stdio.h>

// Prints the emoji U+1F600 three ways: as hex escapes inside a string
// literal, as four individually written bytes, and via a \U escape.
int main(void) {
    // UTF-8 encoding of U+1F600, one byte per element
    static const unsigned char smiley_utf8[] = {0xF0, 0x9F, 0x98, 0x80};

    puts("The unicode code point U+1F600 encodes in UTF-8");
    puts("as 4 bytes: 0xF0 0x9F 0x98 0x80");
    puts("We can output the 4 bytes like this: \xF0\x9F\x98\x80 (UTF-8)");

    fputs("Or like this: ", stdout);
    for (size_t i = 0; i < sizeof smiley_utf8; i++) {
        putchar(smiley_utf8[i]);
    }
    putchar('\n');

    puts("Or like this: \U0001F600 (UTF-32)");
    // UNICODE code point less than 0x10000 (ie the BMP) can be encoded with
    // \uXXXX (lowercase u) with only 4 hex digits
    // \U must always be followed by 8 hex digits
}

        
            utf8_strlen.c
        

        

    
#include <stdio.h>
#include <string.h>
#include <assert.h>

/**
 * @brief Count the code points in a NUL-terminated UTF-8 string
 *
 * Walks the string from head byte to head byte. The head byte of each
 * sequence states how many bytes the sequence occupies (1 to 4); every
 * following tail byte must look like 10xxxxxx.
 *
 * On malformed input (a tail byte where a head byte is expected, a head byte
 * announcing an impossible length, or a sequence that is cut short) a
 * diagnostic is written to stderr and assert(0) aborts the program. If
 * assertions are compiled out (NDEBUG) the function instead returns the
 * number of code points counted before the error.
 *
 * @param string - NUL-terminated UTF-8 string
 * @return unsigned long - number of code points in the string
 */
unsigned long utf8_strlen(char *string) {
    unsigned long num_code_points = 0;
    for (char *code_point = string; *code_point;) {
        // Work on an unsigned copy so the bit tests do not depend on whether
        // plain char is signed.
        unsigned char head = (unsigned char)*code_point;
        int length;

        if ((head & 0xF8) == 0xF0) {
            // 4-byte head byte
            length = 4;
        } else if ((head & 0xF0) == 0xE0) {
            // 3-byte head byte
            length = 3;
        } else if ((head & 0xE0) == 0xC0) {
            // 2-byte head byte
            length = 2;
        } else if ((head & 0xC0) == 0x80) {
            // INVALID STRING
            // tail byte - should not be here
            // as we should be moving from head byte to head byte
            fprintf(stderr, "Invalid UTF-8 string: \"%s\"\n", string);
            fprintf(stderr, "Found a tail byte when head byte was expected\n");
            assert(0);
            return num_code_points; // only reached when built with NDEBUG
        } else if ((head & 0x80) == 0x00) {
            // ASCII
            length = 1;
        } else {
            // INVALID STRING
            // this is not a valid UTF-8 byte
            fprintf(stderr, "Invalid UTF-8 string: \"%s\"\n", string);
            fprintf(stderr, "Head byte indicates invalid length\n");
            assert(0);
            return num_code_points; // only reached when built with NDEBUG
        }

        // Check every announced tail byte BEFORE stepping over it.
        // The NUL terminator is not a tail byte, so a sequence cut short by
        // the end of the string is caught here instead of reading past it.
        for (int i = 1; i < length; i++) {
            if (((unsigned char)code_point[i] & 0xC0) != 0x80) {
                fprintf(stderr, "Invalid UTF-8 string: \"%s\"\n", string);
                fprintf(stderr, "Multi-byte sequence is missing a tail byte\n");
                assert(0);
                return num_code_points; // only reached when built with NDEBUG
            }
        }

        code_point += length;
        num_code_points++;
    }

    return num_code_points;
}

// For each sample string, print its byte count (strlen) next to its
// code point count (utf8_strlen).
int main(void) {
    char *strings[] = {
        "Hello World",
        "Hellо Wоrld",
        "Hellⲟ W𐓪rld",
        "Ⓗⓔⓛⓛⓞ Ⓦⓞⓡⓛⓓ",
        "Hëllo World",
        "Hëllo World",
    };

    for (size_t i = 0; i < sizeof strings / sizeof strings[0]; i++) {
        char *s = strings[i];
        // strlen returns size_t (%zu); utf8_strlen returns unsigned long (%lu)
        printf("\"%s\": strlen=%zu, utf8_strlen=%lu\n", s, strlen(s), utf8_strlen(s));
    }
}

        
            utf8_encode.c
        

        

    
#include <stdio.h>
#include <stdint.h>

/**
 * @brief Print a Unicode code point together with its UTF-8 encoding
 *
 * Writes one line to stdout containing the code point, its value as a
 * 32-bit number, the individual UTF-8 bytes in hex and finally the encoded
 * character itself.
 *
 * UTF-8 only encodes Unicode scalar values: code points above U+10FFFF and
 * the surrogate range U+D800..U+DFFF have no valid encoding. For those a
 * message is written to stderr and nothing is printed to stdout.
 *
 * @param code_point - the code point to encode
 */
void print_utf8_encoding(uint32_t code_point) {
    // At most 4 encoded bytes, plus a terminating NUL so it can be printed with %s
    uint8_t encoding[5] = {0};
    int length;

    if (code_point > 0x10FFFF || (code_point >= 0xD800 && code_point <= 0xDFFF)) {
        fprintf(stderr, "U+%04lx is not a Unicode scalar value, it has no UTF-8 encoding\n",
                (unsigned long)code_point);
        return;
    }

    if (code_point < 0x80) {
        // 1 byte:  0xxxxxxx
        encoding[0] = (uint8_t)code_point;
        length = 1;
    } else if (code_point < 0x800) {
        // 2 bytes: 110xxxxx 10xxxxxx
        encoding[0] = (uint8_t)(0xC0 | (code_point >> 6));
        encoding[1] = (uint8_t)(0x80 | (code_point & 0x3f));
        length = 2;
    } else if (code_point < 0x10000) {
        // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
        encoding[0] = (uint8_t)(0xE0 | (code_point >> 12));
        encoding[1] = (uint8_t)(0x80 | ((code_point >> 6) & 0x3f));
        encoding[2] = (uint8_t)(0x80 | (code_point & 0x3f));
        length = 3;
    } else {
        // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        encoding[0] = (uint8_t)(0xF0 | (code_point >> 18));
        encoding[1] = (uint8_t)(0x80 | ((code_point >> 12) & 0x3f));
        encoding[2] = (uint8_t)(0x80 | ((code_point >> 6) & 0x3f));
        encoding[3] = (uint8_t)(0x80 | (code_point & 0x3f));
        length = 4;
    }

    // uint32_t is not necessarily unsigned int, so convert explicitly to a
    // type that has a matching printf length modifier.
    printf("UNICODE codepoint: U+%04lx, UTF-32: 0x%08lx, UTF-8: ",
           (unsigned long)code_point, (unsigned long)code_point);
    // Iterate by length, not up to the first zero byte, so that U+0000
    // (whose encoding IS a zero byte) is still shown.
    for (int i = 0; i < length; i++) {
        printf("0x%02x ", (unsigned)encoding[i]);
    }
    printf(" %s\n", (const char *)encoding);
}

// Print the encoding of one sample code point for each UTF-8 length (1 to 4 bytes).
int main(void) {
    static const uint32_t samples[] = {
        0x0042,  // 1 byte
        0x00A2,  // 2 bytes
        0x10be,  // 3 bytes
        0x1F600, // 4 bytes
    };

    for (size_t i = 0; i < sizeof samples / sizeof samples[0]; i++) {
        print_utf8_encoding(samples[i]);
    }
}

        
            mojibake.py
        

        



    
# Mojibake demo: the same bytes read as KOI8-R give Russian text, and read as
# Latin-1 give a jumble of accented Latin letters.


def _show(encoding_label, text, codec):
    """Print `text`, then its bytes under `codec` one line at a time; return the bytes."""
    print(f"with {encoding_label} encoding:")
    print(text)

    raw = text.encode(codec)

    print("raw bytes:")
    for line in raw.splitlines():
        print(line)
    print()
    return raw


Russian = """\
россия, москва, 119415
пр.Вернадского, 37,
к.1817-1,
жлетневой светлане
"""

RussianBytes = _show("koi8_r", Russian, 'koi8_r')


# Despite the variable name this is not French: it is what the Russian
# bytes above look like when they are decoded as Latin-1.
French = """\
ÒÏÓÓÉÑ, ÍÏÓË×Á, 119415
ÐÒ.÷ÅÒÎÁÄÓËÏÇÏ, 37,
Ë.1817-1,
ÖÌÅÔÎÅ×ÏÊ Ó×ÅÔÌÁÎÅ
"""

FrenchBytes = _show("latin-1", French, 'latin-1')

# The two texts are different strings but identical byte sequences
print(f"{(RussianBytes == FrenchBytes) = }")

        
            trojan_source.1.c
        

        

    
#include <stdio.h>
#include <string.h>

// most modern text editors will show you the invisible characters in this file
// because this file demonstrates a very real security vulnerability
// In vscode you can disable this by adding the following to your settings.json
//      "editor.renderWhitespace": "none",
//      "editor.unicodeHighlight.ambiguousCharacters": false,
//      "editor.unicodeHighlight.invisibleCharacters": false,
//      "editor.unicodeHighlight.nonBasicASCII": false,
//      "editor.renderControlCharacters": false,
// But you should immediately re-enable it after you have finished reading this file

// Declares and prints two variables whose names contain characters that
// render like a space and like a hyphen, yet are accepted inside identifiers.
int main(void) {

    // This variable doesn't look like it is valid C
    // As variable names must be single words
    // replacing the space with an underscore `_` would make it valid

    // EXCEPT that the space is not a space
    // it's the "HALFWIDTH HANGUL FILLER" character
    // the HALFWIDTH HANGUL FILLER is rather special
    // as it is one of very few characters that are invisible
    // ie don't have a visible glyph
    // yet isn't classed as whitespace
    // (it does not have the White_Space=yes UNICODE property)
    // This means that C allows it to be used in variable names

    int helloﾠworld = 20;
    printf("helloﾠworld = %d\n", helloﾠworld);

    // FULLWIDTH HYPHEN-MINUS can be used to make a variable name
    // that looks like it has a hyphen in it
    // (the name below is a single identifier, not a subtraction)

    int hello－world = 20;
    printf("hello－world = %d\n", hello－world);

    // Please never actually use these characters in variable names

    return 0;
}

        
            trojan_source.2.c
        

        

    
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <stdbool.h>

// most modern text editors will show you the invisible characters in this file
// because this file demonstrates a very real security vulnerability
// In vscode you can disable this by adding the following to your settings.json
//      "editor.renderWhitespace": "none",
//      "editor.unicodeHighlight.ambiguousCharacters": false,
//      "editor.unicodeHighlight.invisibleCharacters": false,
//      "editor.unicodeHighlight.nonBasicASCII": false,
//      "editor.renderControlCharacters": false,
// But you should immediately re-enable it after you have finished reading this file


// Returns true when s1 and s2 hold exactly the same bytes (per strcmp),
// false otherwise.
bool strings_are_equal(char *s1, char *s2) {
    // To see what this function is actually doing
    // uncomment the following line:
    // printf("Comparing \"%s\" and \"%s\"\n", s1, s2);
    int difference = strcmp(s1, s2);
    return !difference;
}

// Demonstrates a "Trojan Source" attack.
// Takes one argument, the access level (0: user, 1: admin), and prints a
// greeting. The check that picks the greeting does not do what it appears to
// do when invisible characters are hidden - see the long comment below.
// Returns 1 on a usage error, 0 otherwise.
int main(int argc, char *argv[]) {
    if (argc != 2)
    {
        printf("Usage: %s <access_level>\n", argv[0]);
        printf("Access levels:\n");
        printf("0: user, 1: admin\n");
        return 1;
    }

    char *access_levels[] = {
        "user",
        "admin"
    };

    char *endptr;
    long selected_access_level = strtol(argv[1], &endptr, 0);

    // endptr == argv[1]: strtol found no number at all (eg an empty argument)
    // *endptr != '\0':   there is trailing junk after the number
    if (endptr == argv[1] || *endptr != '\0')
    {
        printf("Invalid access level: %s\n", argv[1]);
        printf("Access levels:\n");
        printf("0: user, 1: admin\n");
        return 1;
    }

    // Also guarantees the value is a valid index into access_levels
    if (selected_access_level < 0 || selected_access_level > 1)
    {
        printf("Invalid access level: %ld\n", selected_access_level);
        printf("Access levels:\n");
        printf("0: user, 1: admin\n");
        return 1;
    }

    char *current_access_level = access_levels[selected_access_level];

    printf("Current access level: %s\n", current_access_level);

    // with invisible characters not visible
    // this next line will likely look like:
    //      if (!strings_are_equal(current_access_level, "user")) // Check if admin
    // But that's not actually what it is
    // the real code is:
    //      if (!strings_are_equal(current_access_level, "user <invisible-characters>// Check if admin<invisible-characters>"))
    // The invisible characters here are:
    //      the RIGHT-TO-LEFT OVERRIDE (U+202E),
    //      the LEFT-TO-RIGHT ISOLATE (U+2066),
    //      the POP DIRECTIONAL ISOLATE (U+2069),
    // In that order
    // These characters tell the text editor to render the text in a forced direction
    // Instead of the normal direction of the text (left-to-right in the western world, right-to-left in the arabic world)
    // Forcing the text to be rendered in a different direction is a common way to make text appear to be something it is not
    // Like in this case where what looks like a comment is actually part of the string
    // Neither "user" nor "admin" equals that longer string,
    // so the comparison never succeeds and everyone is greeted as admin
    if (!strings_are_equal(current_access_level, "user‮ ⁦// Check if admin⁩ ⁦"))
    {
        printf("Hello admin.\n");
    }
    else
    {
        printf("Hello user.\n");
    }

    return 0;
}

        
            trojan_source.3.c
        

        

    
#include <stdio.h>
#include <string.h>

// most modern text editors will show you the invisible characters in this file
// because this file demonstrates a very real security vulnerability
// In vscode you can disable this by adding the following to your settings.json
//      "editor.renderWhitespace": "none",
//      "editor.unicodeHighlight.ambiguousCharacters": false,
//      "editor.unicodeHighlight.invisibleCharacters": false,
//      "editor.unicodeHighlight.nonBasicASCII": false,
//      "editor.renderControlCharacters": false,
// But you should immediately re-enable it after you have finished reading this file

// Shows that identifiers which look identical on screen can be distinct
// variables when they are spelled with look-alike letters from other scripts.
int main(void) {
    // These three variables look to be all the same
    // But they are not, they are all different variables
    // Each variable uses a different unicode character for the `o`
    // The first variable uses the "latin small letter o" (U+006F)
    // The second variable uses the "armenian small letter oh" (U+0585)
    // The third variable uses the "cyrillic small letter o" (U+043E)
    // They all look the same, but the C compiler treats them as different variables

    int total;        // latin
    int tօtal;        // armenian
    int tоtal = -666; // cyrillic

    total = 100; // latin
    printf("total = %d\n", total); // latin - prints 100

    tօtal = 999; // armenian
    printf("total = %d\n", total); // latin - still prints 100, the assignment above went to a different variable

    total = tօtal; // latin = armenian
    printf("total = %d\n", tоtal); // cyrillic - prints -666, this variable was never reassigned

    printf("total = %d\n", tօtal); // armenian - prints 999

    return 0;
}