#!/bin/dash

# written by andrewt@unsw.edu.au for COMP(2041|9044)
#
# Improved version of plagiarism_detection.reordering.sh
#
# Usage: pass the source files to compare as arguments.
# Files whose normalised contents are identical are printed in groups,
# with a blank line between groups.
#
# sha256sum is used to calculate a cryptographic hash of the normalised file
# (https://en.wikipedia.org/wiki/SHA-2), and sort + uniq are then used to find
# files with the same hash.
# This avoids comparing every file against every other file.
# A faster, less secure hashing function could be used instead of SHA-2.

# Print the SHA-256 hash of a normalised copy of a source file.
# Arguments: $1 - path of the file to hash
# Outputs:   the sha256sum output line for the normalised text (hash, then "-")
#
# Normalisation, applied line by line:
#   1. remove // comments
#   2. replace every double-quoted string literal (of any length, including
#      the empty string) with the placeholder s
#   3. replace every identifier with v (this also rewrites the placeholder s)
# The resulting lines are sorted, so reordering lines does not change the hash.
sha2hash() {
    sed '
        s/\/\/.*//
        s/"[^"]*"/s/g
        s/[a-zA-Z_][a-zA-Z0-9_]*/v/g
    ' -- "$1" |
        sort |
        sha256sum
}

# Emit one "<hash>  - <filename>" line per file, then keep only the lines
# whose hash occurs more than once.
for file in "$@"
do
    # printf rather than echo: dash's echo interprets backslashes in filenames
    printf '%s %s\n' "$(sha2hash "$file")" "$file"
done |
    sort |
    # a SHA-256 digest is 64 hex characters: compare all of them and nothing else
    uniq -w64 -d --all-repeated=separate