#!/usr/bin/env bash

# TODO: Add a '-h' or '--help' flag that displays information about the script and how to use it.

# s4g - Stupid Simple Static-Site Generator
# ---------------------------------------------
#
# This is a static-site generator that can be used to convert Markdown files into HTML.
# It is extremely simple and extremely opinionated, as you can see if you read the code below.
# A lot of paths and directories are hard-coded to suit my workflow.
#
# To use it, create a directory for your project (e.g. 'website'). Inside 'website', create
# two directories: 'source' (which holds your Markdown files) and 'output' (which holds the
# converted HTML). To exclude files from the conversion process, place them inside a directory
# named 'exclude' inside 'source'. This directory will not be copied over into 'output', and
# any files inside it will not be converted.
#
# In addition to these directories, three files are needed in 'website':
# 1. 'header.html' - A header, which is prepended to every source file. Unfortunately, this must
#    be written in regular HTML.
# 2. 'footer.html' - A footer, which is appended to every source file. Also must be written in HTML.
# 3. 'styles.css' - A global stylesheet.
#
# The script takes one parameter: the directory that contains the 'source' and 'output' folders.
#
# If you have any comments or questions, please email aadhavan@twomorecents.org.
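#
# For illustration, a typical project layout and invocation might look like the following
# ('website' and the script name are examples; 'pandoc_filters', 'files' and 'fonts' are
# additional hard-coded paths referenced further down in this script):
#
#   website/source/          - Markdown files ('source/exclude/' is skipped)
#   website/header.html      - prepended to every page
#   website/footer.html      - appended to every page
#   website/styles.css       - global stylesheet
#   website/pandoc_filters/  - Lua filters passed to pandoc
#   website/files/, fonts/   - copied into 'output' as-is
#
#   $ ./s4g.sh website       # converted HTML is written to website/output/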

set -o errexit   # Stop executing when a command fails
set -o nounset   # Stop executing when accessing an unset variable
set -o pipefail  # Treat a pipeline as failing if any command in the pipeline fails

if [[ "${TRACE-0}" == "1" ]]; then set -o xtrace; fi # Enable tracing (output of each command) if the TRACE variable is set

if [ "$#" -ne 1 ]; then
    echo "ERROR: Invalid number of parameters. Read the script for more details." >&2
    exit 1
fi
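
# A minimal sketch of the '-h'/'--help' flag requested in the TODO at the top of this
# script. The usage text is an assumption based on the description above; adjust as needed.
if [[ "$1" == "-h" || "$1" == "--help" ]]; then
    cat <<'EOF'
Usage: s4g <project-directory>

Converts the Markdown files in <project-directory>/source into HTML pages in
<project-directory>/output, using header.html, footer.html and styles.css from
<project-directory>. Files under source/exclude are skipped.

Set TRACE=1 to print each command as it is executed.
EOF
    exit 0
fi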

BASE_PATH=$(cd "$1" && pwd) # Resolve the project directory to an absolute path

check_for_dirs() {
    if [[ ! -d "${BASE_PATH}/source" ]]; then
        echo "ERROR: 'source' folder does not exist. Your content is sourced from this folder." >&2
        exit 1
    fi

    if [[ -d "${BASE_PATH}/temp" ]]; then
        echo "ERROR: You have an existing 'temp' folder. Please delete this folder, and run the script again." >&2
        exit 1
    fi

    if [[ ! -f "${BASE_PATH}/header.html" ]]; then
        echo "ERROR: You do not have a header.html file. This file is used as a global header. Please create this file, and run the script again." >&2
        exit 1
    fi

    if [[ ! -f "${BASE_PATH}/footer.html" ]]; then
        echo "ERROR: You do not have a footer.html file. This file is used as a global footer. Please create this file, and run the script again." >&2
        exit 1
    fi
}

setup_temp_dir() {
    # 'check_for_dirs' has already verified that 'temp' does not exist, so create it
    mkdir "${BASE_PATH}/temp"
}

setup_output_dir() {
    rm -rf "${BASE_PATH}/output" # Delete existing 'output' directory
    cp -r "${BASE_PATH}/source" "${BASE_PATH}/output" # Copy directory structure from 'source' to 'output'
}

del_files_in_output() {
    find "$BASE_PATH/output" -type f -name "*.md" -delete # Delete all .md files (which were copied over from 'source') in 'output'

    # Delete the 'exclude' directory from the output folder.
    # This folder contains markdown files which shouldn't be converted to HTML.
    if [[ -d "${BASE_PATH}/output/exclude" ]]; then
        rm -r "${BASE_PATH}/output/exclude"
    fi
}
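
# Each source file is expected to begin with a small metadata block: 'key: value' pairs,
# one per line, terminated by a blank line. For illustration (the 'title' key is an
# example; 'date' and 'noappend' are the keys handled explicitly below):
#
#   title: My First Post
#   date: auto
#   noappend: false
#
#   The article body starts after the blank line...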

read_metadata() {
    # Read the metadata from the top of a .md file into the global 'metadata' variable
    metadata=$(awk 'BEGIN{RS = "\n\n"} {print $0}; {exit}' "$1") # Reads from the .md file until a double-newline is encountered
}

convert_to_array() {
    local meta_key
    local meta_value

    # Converts the metadata into two arrays: one with the keys, and the other with the values.
    readarray -t meta_key < <(echo -e "$1" | awk -F: '{print $1}')
    readarray -t meta_value < <(echo -e "$1" | awk -F: '{st = index($0,":"); values = substr($0,st+1); print values}' | cut -c 2-)

    # Merge both arrays into an associative array
    declare -Ag meta_array
    for index in "${!meta_key[@]}"; do
        meta_array["${meta_key[$index]}"]="${meta_value[$index]}"
    done
}

add_date_to_array() {
    if [[ "${meta_array[date]-}" == "auto" ]]; then # If the date is set to 'auto'
        meta_array["date"]="$(date -r "$1" +'%b %d, %Y')" # Use the file's last-modification time as the published date
    fi
}

add_header_and_footer() {
    # Copy header to temporary location - 'parent_dir' is used to ensure that
    # each temporary header is in its own directory
    cp "$BASE_PATH/header.html" "$BASE_PATH/temp/$parent_dir/temp_header.html"

    # Check for relevant metadata, and perform corresponding action
    # This syntax is intended (although it doesn't follow typical Bash array syntax). See https://stackoverflow.com/a/45385463 for more info.
    if [[ ! -v "meta_array[date]" ]]; then # If there is no date
        sed -i '$ d' "$BASE_PATH/temp/$parent_dir/temp_header.html" # remove the 'date published' section of the header
    fi

    if [[ "${meta_array[noappend]-}" == "true" ]]; then
        sed -i 's/ - Two More Cents//g' "$BASE_PATH/temp/$parent_dir/temp_header.html" # 'noappend' removes the suffix from the title
    fi

    # Add header
    cat "$BASE_PATH/temp/$parent_dir/temp_header.html" "$1" > "$BASE_PATH/temp/$parent_dir/temp.html"

    # Add footer
    echo >> "$BASE_PATH/temp/$parent_dir/temp.html" # Add newline
    cat "$BASE_PATH/footer.html" >> "$BASE_PATH/temp/$parent_dir/temp.html"

    # Move temp file to original location
    mv "$BASE_PATH/temp/$parent_dir/temp.html" "$1"
}
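
# For reference, a hypothetical header.html that is compatible with the logic above and
# with replace_vars() below (the real header is user-supplied and may differ):
#
#   <!DOCTYPE html>
#   <html>
#   <head><title>$$title$$ - Two More Cents</title></head>
#   <body>
#   <h1>$$title$$</h1>
#   <p class="date-published">Published on $$date$$</p>
#
# The 'date published' element must be the last line of the file, because the script
# strips it with "sed -i '$ d'" (above) and 'head -n -1' (below) for pages without a date.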

add_header_and_footer_to_index() {
    mkdir "$BASE_PATH/temp/index_page"
    # Add header
    head -n -1 "$BASE_PATH/header.html" | cat - "$1" > "$BASE_PATH/temp/index_page/temp.html" # For the index page, remove the last line of the header (date published)

    # Add footer
    echo >> "$BASE_PATH/temp/index_page/temp.html" # Add newline
    cat "$BASE_PATH/footer.html" >> "$BASE_PATH/temp/index_page/temp.html"

    # Move temp file to original location
    mv "$BASE_PATH/temp/index_page/temp.html" "$1"
}
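
# For illustration: with meta_array[title]='My First Post', replace_vars() below turns
#   <title>$$title$$ - Two More Cents</title>
# into
#   <title>My First Post - Two More Cents</title>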

replace_vars() {
    # Loop through the keys of the 'meta_array' array, search for all occurrences of the key in the HTML doc, and replace them with the corresponding value.
    for arr_key in "${!meta_array[@]}"; do
        meta_array["$arr_key"]="${meta_array["$arr_key"]//\//\\/}" # Escape all forward slashes in the value, so they don't break the sed expression below
        sed -i "s/[\$][\$]$arr_key[\$][\$]/${meta_array[$arr_key]}/g" "$1"
    done
}

convert_file() {
    # Helper function for md_to_html(). It takes in the file to convert as an argument,
    # and converts that file.
    file_to_conv="$1"
    echo "Converting $file_to_conv"

    read_metadata "$file_to_conv" # Sets the 'metadata' variable

    # Generate a random 8-character alphabetic string, until we find
    # one that doesn't exist in the 'temp' directory. This string
    # will serve as the parent directory of our file.
    parent_dir="$(tr -dc A-Za-z </dev/urandom | head -c 8; echo)"
    while ls -r "$BASE_PATH"/temp | grep -q "$parent_dir" ; do
        parent_dir="$(tr -dc A-Za-z </dev/urandom | head -c 8; echo)"
    done

    # Copy file to temp dir and strip metadata
    mkdir -p "$BASE_PATH/temp/$parent_dir/"
    cp "$file_to_conv" "$BASE_PATH/temp/$parent_dir/"
    num_lines=$(( $(echo "$metadata" | wc -l) + 1 )) # Number of metadata lines, plus the blank line that follows them
    sed -i "1,${num_lines}d" "$BASE_PATH/temp/$parent_dir/$(basename "$file_to_conv")"

    # Construct path for output file
    local path_for_output
    path_for_output=$(realpath --relative-to="${BASE_PATH}/source" "$file_to_conv")
    path_for_output="${BASE_PATH}/output/${path_for_output}"
    path_for_output="$(dirname "$path_for_output")/$(basename "$path_for_output" .md).html"

    # Convert the file (using the given filters), and place the output in the correct location.
    pandoc --lua-filter "$BASE_PATH"/pandoc_filters/* -f markdown --wrap=preserve "$BASE_PATH/temp/$parent_dir/$(basename "$file_to_conv")" > "${path_for_output}"

    convert_to_array "$metadata" # Sets the 'meta_array' array

    add_date_to_array "$file_to_conv" # Uses 'meta_array' array
    add_header_and_footer "$path_for_output" # Uses 'meta_array' array and 'parent_dir' variable
    replace_vars "$path_for_output" # Uses 'meta_array' array

    unset metadata meta_key meta_value meta_array path_for_output
}

md_to_html() {
    # Convert .md files from 'source' and place them into the correct locations into 'output'
    # Exclude all files and folders inside the 'exclude' directory
    local files=$(find "${BASE_PATH}/source" -not -path "${BASE_PATH}/source/exclude/*" -name "*.md")

    # Concurrently convert each document
    for file in $files; do
        (convert_file "$file") &
        unset metadata path_for_output
    done

    # Wait for all documents to finish converting, then remove all temporary files.
    wait
    rm -rf "$BASE_PATH"/temp/*
}
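
# The site map and RSS feed are built only from pages that contain a 'date-published'
# element. For illustration, given a (hypothetical) line like
#   <p class="date-published">Published on Jan 02, 2025</p>
# the pipeline in gen_sorted_file_list() extracts 'Jan 02, 2025', rewrites it as
# '02 Jan 2025', and feeds it to 'date -f - +"%s"' to obtain a sortable UNIX timestamp.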

gen_sorted_file_list() { # Generate a list of the HTML files, sorted by their published date (read from the contents of each HTML file)
    local files=$(find "$BASE_PATH/output" -name "*.html")
    local date_mod

    for file in $files; do
        if grep -q "date-published" "$file" ; then
            echo "$file" >> "$BASE_PATH/temp/file_listing.txt" # Write files that have a published date to a temp file (only these files can be listed with their date on the site map)

            date_mod+=$(cat "$file" | grep "date-published" | awk -F'[<>]' '{print $3}' \
                | cut -d' ' -f '1,2' --complement | tr -d "," | awk '{print $2" "$1" "$3}' \
                | date -f - +"%s")
            # Explanation:
            # Line 1 extracts the published date from the HTML file
            # Line 2 re-arranges this information into 'DD Mon YYYY' format
            # Line 3 converts this into a UNIX timestamp

            date_mod+=$'\n'
        fi
    done

    date_mod=$(echo "${date_mod-}" | head -n -1) # Remove last (empty) line from variable
    echo "${date_mod-}" > "$BASE_PATH/temp/date_mod.txt" # Write the corresponding publication timestamps to a temp file

    paste "$BASE_PATH/temp/file_listing.txt" "$BASE_PATH/temp/date_mod.txt" > "$BASE_PATH/temp/new_file_list.txt" # Combine the file list and the timestamps into a single file

    sorted_file_list=$(sort -r -k 2 "$BASE_PATH/temp/new_file_list.txt") # Sort the data in the file based on the timestamp (from newest to oldest), and store it into a variable
    sorted_file_list=$(echo "$sorted_file_list" | awk '{print $1}') # Store only the first column (the file path) in the variable
}

gen_rss_feed() { # Uses the sorted_file_list variable to generate an RSS feed
    echo "Generating RSS Feed..."
    local RSS_FEED_PATH="${BASE_PATH}/output/rss.xml"
    touch "$RSS_FEED_PATH" # Create the RSS file
    local RSS_CONTENT="<rss version=\"2.0\">\n"
    counter=0
    RSS_CONTENT+="<channel>\n"
    RSS_CONTENT+="<title>Two More Cents</title>\n"
    RSS_CONTENT+="<link>http://twomorecents.org/</link>\n"
    RSS_CONTENT+="<description>The personal website of Aadhavan Srinivasan.</description>\n"
    RSS_CONTENT+="<language>en-us</language>\n"
    RSS_CONTENT+="<lastBuildDate>$(date -R)</lastBuildDate>\n"
    RSS_CONTENT+="<generator>s4g - Stupid Simple Static Site Generator</generator>\n"

    for file in $1; do
        if [ "$counter" -gt 9 ]; then # Only include the ten most recent articles
            break
        fi
        RSS_CONTENT+="<item>\n"
        RSS_CONTENT+="<title>\n"
        RSS_CONTENT+=$(cat "$file" | grep "<title>" | head -n 1 | awk -F'[<>]' '{print $3}')$'\n'
        RSS_CONTENT+="</title>\n"
        RSS_CONTENT+="<link>\n"
        RSS_CONTENT+="https://twomorecents.org/"
        RSS_CONTENT+=$(realpath --relative-to="${BASE_PATH}/output" "$file")
        RSS_CONTENT+="</link>\n"
        RSS_CONTENT+="</item>\n"
        ((++counter))
    done

    RSS_CONTENT+="</channel>\n</rss>"

    echo -e "$RSS_CONTENT" > "$RSS_FEED_PATH"
}
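
# For illustration, each site-map entry emitted by gen_index_page() below looks roughly
# like this (file name and title are hypothetical):
#   <li><time>01/02/2025</time> - <a href="my-first-post.html">My First Post</a></li>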

gen_index_page() { # Generate an index page (site map) that includes links to the other pages

    echo "Generating index page..."

    local index_file_html="<nav class=\"toc\">"$'\n' # Variable to store the body HTML of the index page; enclose the list in a nav
    index_file_html+="<p>(All dates are in MM/DD/YYYY format)</p>"$'\n'

    for file in $1; do
        local title=$(cat "$file" | grep "<title>" | head -n 1 | awk -F'[<>]' '{print $3}') # Find the title of the web page
        local suffix=" - Two More Cents"
        title="${title%"$suffix"}" # Remove the website name from it

        local pub_date=$(cat "$file" | grep "date-published" | head -n 1 | awk -F'[<>]' '{print $3}') # Find the date published of the web page
        prefix="Published on "
        pub_date="${pub_date#"$prefix"}" # Remove the prefix from it
        pub_date=$(echo "$pub_date" | tr -d "," | awk '{print $2" "$1" "$3}' | date -f - +"%m/%d/%Y") # Re-arrange the date and convert it to MM/DD/YYYY

        local file_path=$(realpath --relative-to="${BASE_PATH}/output" "$file")

        index_file_html+="<li><time>${pub_date}</time> - <a href=\"$file_path\">$title</a></li>" # Add a line of HTML containing the date and title of the article
        index_file_html+=$'\n'
    done

    index_file_html=$(echo "$index_file_html" | head -n -1) # Remove last (empty) line from variable
    index_file_html+="</nav>"

    path_for_output="${BASE_PATH}/output/site-map.html"
    echo "$index_file_html" > "$path_for_output" # Output variable to file

    add_header_and_footer_to_index "$path_for_output" # Add header and footer to index file
    sed -i 's/[\$][\$]title[\$][\$]/Site Map/g' "$path_for_output" # Replace the title variable with the 'Site Map' title
}

copy_things_in() {
    # Copy the stylesheet, and the hard-coded 'files' and 'fonts' directories, into 'output'
    cp "${BASE_PATH}/styles.css" "${BASE_PATH}/output/"
    cp -r "${BASE_PATH}/files" "${BASE_PATH}/output/"
    cp -r "${BASE_PATH}/fonts" "${BASE_PATH}/output/"
}

clean_up() {
    rm -r "${BASE_PATH}/temp"
}

check_for_dirs
setup_temp_dir
setup_output_dir
del_files_in_output
md_to_html
gen_sorted_file_list # Sets the 'sorted_file_list' variable
gen_rss_feed "$sorted_file_list" # Uses the 'sorted_file_list' variable
gen_index_page "$sorted_file_list" # Uses the 'sorted_file_list' variable
copy_things_in
clean_up