Compare commits

...

20 Commits

Author SHA1 Message Date
9d5f038dad Skip newlines and YAML metadata at the beginning of a markdown file 2025-04-19 11:28:56 -04:00
afccdc5463 Allow files to be placed inside 'drafts' inside 'source'; will be converted to HTML, but not shown on sitemap 2025-02-16 15:36:04 -05:00
5e48d58561 Concatenate all lua filters into 1 file, then provide that file to 'pandoc' 2025-02-15 14:21:47 -05:00
628a03b2da New feature: Exclude all files and directories inside the 'exclude' directory in 'source' 2025-01-07 12:07:34 -06:00
71a242e1d6 Added double-slash to escape _all_ forward slashes 2024-10-11 10:52:27 -04:00
423eba2213 Added comment to explain weird use of associative array syntax 2024-10-11 10:33:18 -04:00
f1f5ab51b2 Fixed error where I would create a file called RSS_FEED_PATH 2024-07-25 10:55:03 -05:00
dc61c92a4e Added function to generate RSS file 2024-07-25 10:43:12 -05:00
e0c79984b2 Added info to top of site-map page, and fixed bug with 'temp' directory name 2024-06-05 10:34:34 -05:00
54dda40de4 Made the script take in a parameter, which is the directory containing 'source' and 'output' 2024-05-06 10:27:02 -05:00
d098c7c290 Changed name at the top of the comments section 2024-05-04 10:21:35 -04:00
0cb59bd30b Updated comment section at top of file to remove information that's no longer accurate 2024-04-29 17:59:14 -04:00
1f05a69106 Fixed issues with concurrent document conversion
Before, the documents all used to have the same content due to a bug
in the concurrent conversion. I fixed this by creating a function,
'convert_file()', that converts an individual document. I then called
this function concurrently for each file that I want to convert. To
prevent race conditions, where two parallel conversions access the same
file, I ensured that each file gets placed into its own unique directory,
with a randomly-generated name.
2024-04-29 17:56:58 -04:00
0023d8ec7f Made the document conversion multithreaded, to speed up the script (50% reduction in time) 2024-04-28 17:45:14 -04:00
554071f7f4 Updated build script, so that 'Two More Cents' doesn't get added twice to the title of the 'Site Map' page 2024-04-23 12:01:17 -04:00
cd891e38e9 Updated build script to not throw an error if output directory doesn't exist 2024-04-10 22:05:13 -04:00
c965b825fb Added a few log statements 2024-04-10 21:50:16 -04:00
1bf8064ac9 Added section to top of script, briefly describing what it does and how it works 2024-03-09 21:44:46 -05:00
568050cb53 Added some comments to the top 2024-02-11 16:35:33 -05:00
c82b13f3ca Made all variables inside functions local 2024-02-11 00:17:33 -05:00

223
build.sh
View File

@@ -1,5 +1,39 @@
#!/usr/bin/env bash

# s4g - Stupid Simple Static-Site Generator
# ---------------------------------------------
#
# This is a static-site generator, that can be used to convert Markdown files into HTML.
# It is extremely simple and extremely opinionated, as you can see if you read the code below.
# A lot of paths and directories are hard-coded, to suit my workflow.
#
# To use it, create a directory for your project (e.g. 'website'). Inside 'website', create
# two directories: 'source' (which holds your Markdown files) and 'output' (which holds the
# converted HTML).
#
# To exclude files from the conversion process, place them inside a directory
# named 'exclude' inside 'source'. This directory will not be copied over into 'output', and
# any files inside it will not be converted.
#
# If you want a file to be converted, but not displayed in the sitemap, then place it inside a directory
# called 'drafts' inside 'source'. The converted file will still be accessible via its path, but it will not
# be shown on the sitemap.
#
# In addition to these directories, three files are needed in 'website':
# 1. 'header.html' - A header, which is prepended to every source file. Unfortunately, this must
#    be written in regular HTML.
# 2. 'footer.html' - A footer, which is appended to every source file. Also must be written in HTML.
# 3. 'styles.css' - A global stylesheet.
#
# The script takes in a parameter, which is the directory that contains the 'source' and 'output' folders.
#
# If you have any comments or questions, please email aadhavan@twomorecents.org.

set -o errexit  # Stop executing when a command fails
set -o nounset  # Stop executing when accessing an unset variable
set -o pipefail # Treat a pipeline as failing, even if one command in the pipeline fails

if [[ "${TRACE-0}" == "1" ]]; then set -o xtrace; fi # Enable tracing (output of each command) if the TRACE variable is set

if [ "$#" -ne 1 ]; then
	# Diagnostics belong on stderr, and a usage error must exit non-zero:
	# the old 'exit' (no status) returned the echo's status, i.e. success.
	echo "ERROR: Invalid number of parameters. Read script for more details." >&2
	exit 1
fi

# Resolve the project directory to an absolute path. '&&' ensures a failed
# 'cd' (e.g. a nonexistent argument) does not silently fall back to running
# 'pwd' in the current directory.
BASE_PATH=$(cd "$1" && pwd)
check_for_dirs() { check_for_dirs() {
if [[ ! -d "${BASE_PATH}/source" ]]; then if [[ ! -d "${BASE_PATH}/source" ]]; then
@@ -39,20 +76,28 @@ setup_temp_dir() {
} }
setup_output_dir() {
	# Rebuild 'output' from scratch: delete any previous build, then mirror
	# the directory tree (and files) of 'source' into 'output'.
	# '${BASE_PATH:?}' aborts instead of running 'rm -rf /output' if
	# BASE_PATH is ever unset or empty; '--' guards option-like names.
	rm -rf -- "${BASE_PATH:?}/output"
	cp -r "${BASE_PATH}/source" "${BASE_PATH}/output"
}
del_files_in_output() {
	# Remove the Markdown sources that were copied into 'output' by
	# setup_output_dir(); only the converted .html files should remain.
	find "$BASE_PATH/output" -type f -name "*.md" -delete
	# The 'exclude' directory holds files that must never be published;
	# drop it from 'output' entirely (if it was copied over).
	# ':?' aborts rather than deleting '/output/exclude' if BASE_PATH is
	# unset or empty; '--' guards against option-like names.
	if [[ -d "${BASE_PATH}/output/exclude" ]]; then
		rm -rf -- "${BASE_PATH:?}/output/exclude"
	fi
}
read_metadata() {
	# Extract the metadata block of a .md file into the global 'metadata'
	# variable. awk reads paragraph-at-a-time (RS = "\n\n" — GNU awk treats
	# a multi-character RS as a regex) and emits the first record that is
	# non-empty and does not begin with '---' (a '---' record is a pandoc
	# YAML metadata block, which pandoc handles itself), then stops.
	local md_file="$1"
	metadata=$(awk 'BEGIN{RS = "\n\n"} ($0 != "") && ($0 !~ /^---/) {print $0; exit}' "$md_file")
}
convert_to_array() { convert_to_array() {
local meta_key
local meta_value
# Converts the metadata into two arrays: one with the key, and the other with the value. # Converts the metadata into two arrays: one with the key, and the other with the value.
readarray -t meta_key < <(echo -e "$1" | awk -F: '{print $1}') readarray -t meta_key < <(echo -e "$1" | awk -F: '{print $1}')
readarray -t meta_value < <(echo -e "$1" | awk -F: '{st = index($0,":"); values = substr($0,st+1); print values}' | cut -c 2-) readarray -t meta_value < <(echo -e "$1" | awk -F: '{st = index($0,":"); values = substr($0,st+1); print values}' | cut -c 2-)
@@ -71,85 +116,113 @@ add_date_to_array() {
} }
add_header_and_footer() {
	# Prepend the site header and append the footer to the converted HTML
	# file "$1". Reads the globals 'parent_dir' (this conversion's private
	# scratch directory under temp/, so parallel conversions never share
	# temp files) and 'meta_array' (the page's metadata key/value map).
	local target="$1"
	local scratch="$BASE_PATH/temp/$parent_dir"
	local header_copy="$scratch/temp_header.html"
	local assembled="$scratch/temp.html"

	# Work on a private copy of the header so edits don't touch the original.
	cp "$BASE_PATH/header.html" "$header_copy"

	# No 'date' key in the metadata -> remove the header's last line (the
	# 'date published' section). The quoted-subscript syntax is intended
	# (although it doesn't follow typical Bash array syntax); see
	# https://stackoverflow.com/a/45385463 for more info.
	if [[ ! -v "meta_array[date]" ]]; then
		sed -i '$ d' "$header_copy"
	fi
	# 'noappend: true' strips the site-name suffix from the page title.
	if [[ "${meta_array[noappend]-}" == "true" ]]; then
		sed -i 's/ - Two More Cents//g' "$header_copy"
	fi

	# Assemble: header + page body, a blank separator line, then the footer.
	# (Single 'cat header body' replaces the old 'cat header | cat - body'.)
	cat "$header_copy" "$target" > "$assembled"
	echo >> "$assembled"
	cat "$BASE_PATH/footer.html" >> "$assembled"
	# Move the assembled file back over the original.
	mv "$assembled" "$target"
}
add_header_and_footer_to_index() {
	# Wrap the generated index (site-map) page "$1" with the site header
	# and footer. The header's last line (the 'date published' entry) is
	# dropped, since the index page has no publication date.
	local work_dir="$BASE_PATH/temp/index_page"
	# -p: under errexit, a bare 'mkdir' would abort the whole script if a
	# previous run left this directory behind.
	mkdir -p "$work_dir"
	head -n -1 "$BASE_PATH/header.html" | cat - "$1" > "$work_dir/temp.html"
	echo >> "$work_dir/temp.html" # blank line between body and footer
	cat "$BASE_PATH/footer.html" >> "$work_dir/temp.html"
	# Move the assembled file back over the original.
	mv "$work_dir/temp.html" "$1"
}
replace_vars() {
	# For every key in the global 'meta_array', replace each occurrence of
	# $$key$$ in the HTML file "$1" with the corresponding metadata value.
	# Values are escaped for use in a sed replacement: besides '/' (the
	# delimiter), '\' and '&' must be escaped too — a bare '&' would be
	# expanded by sed to the whole matched pattern. Unlike before, the
	# escaping is done on a local copy, so meta_array is not mutated.
	local arr_key escaped
	for arr_key in "${!meta_array[@]}"; do
		escaped="${meta_array[$arr_key]}"
		escaped="${escaped//\\/\\\\}" # escape backslashes first
		escaped="${escaped//\//\\/}"  # escape forward slashes (sed delimiter)
		escaped="${escaped//&/\\&}"   # escape '&' (sed: the whole match)
		sed -i "s/[\$][\$]$arr_key[\$][\$]/${escaped}/g" "$1"
	done
}
convert_file() {
	# Helper for md_to_html(): converts one Markdown file ($1) to HTML and
	# places the result at the matching path in 'output'. Designed to run
	# as a background job; each invocation gets its own scratch directory
	# so concurrent conversions never share temporary files.
	local file_to_conv="$1"
	echo "Converting $file_to_conv"

	read_metadata "$file_to_conv" # Sets the 'metadata' variable

	# Create a unique scratch directory under 'temp'. mktemp -d creates it
	# atomically, which (unlike the old generate-a-name-then-'ls | grep'
	# loop) cannot race with a parallel conversion picking the same name.
	mkdir -p "$BASE_PATH/temp"
	local scratch_dir
	scratch_dir="$(mktemp -d "$BASE_PATH/temp/XXXXXXXX")"
	parent_dir="$(basename "$scratch_dir")" # global: add_header_and_footer() reads it

	# Copy the file into the scratch dir and strip the metadata block (its
	# lines plus the blank line that follows).
	# NOTE(review): if the source file begins with a YAML block,
	# read_metadata() returns the first paragraph *after* it, yet the sed
	# below still deletes from line 1 — confirm this is the intent.
	cp "$file_to_conv" "$BASE_PATH/temp/$parent_dir/"
	local num_lines=$(( $(echo "$metadata" | wc -l) + 1 ))
	sed -i "1,${num_lines}d" "$BASE_PATH/temp/$parent_dir/$(basename "$file_to_conv")"

	# Output path: same relative location as in 'source', with the
	# extension swapped from .md to .html. (Paths are quoted — the old
	# unquoted dirname/basename calls broke on spaces.)
	local path_for_output
	path_for_output=$(realpath --relative-to="${BASE_PATH}/source" "$file_to_conv")
	path_for_output="${BASE_PATH}/output/${path_for_output}"
	path_for_output="$(dirname "$path_for_output")/$(basename "$path_for_output" .md).html"

	# Concatenate every lua filter and hand the combined script to pandoc
	# as a single filter.
	local all_lua_filters
	all_lua_filters="$(cat "$BASE_PATH"/pandoc_filters/*.lua)"
	pandoc --lua-filter <(echo "$all_lua_filters") -f markdown --wrap=preserve \
		"$BASE_PATH/temp/$parent_dir/$(basename "$file_to_conv")" > "${path_for_output}"

	convert_to_array "$metadata"             # Sets the 'meta_array' array
	add_date_to_array "$file_to_conv"        # Uses 'meta_array'
	add_header_and_footer "$path_for_output" # Uses 'meta_array' and 'parent_dir'
	replace_vars "$path_for_output"          # Uses 'meta_array'
	unset metadata meta_key meta_value meta_array path_for_output
}
md_to_html() {
	# Convert every .md file in 'source' (except those under the 'exclude'
	# directory) into HTML in 'output', one background job per file.
	local file
	# NUL-delimited find + read loop: safe for filenames containing spaces
	# (the old unquoted 'for file in $files' split on any whitespace).
	while IFS= read -r -d '' file; do
		convert_file "$file" & # '&' already runs the job in a subshell
	done < <(find "${BASE_PATH}/source" -not -path "${BASE_PATH}/source/exclude/*" -name "*.md" -print0)
	# Barrier: wait for every conversion to finish, then clear the scratch
	# space. ':?' guards the rm against an unset/empty BASE_PATH.
	wait
	rm -rf "${BASE_PATH:?}"/temp/*
}
gen_sorted_file_list() { # Generate a list of the HTMl files, sorted by when they were last modified (read from the contents of the HTML file). Exclude all files in the 'drafts' directory.
local files=$(find "$BASE_PATH/output" -name "*.html" -not -path "${BASE_PATH}/output/drafts/*")
local date_mod local date_mod
for file in $files; do for file in $files; do
@@ -177,21 +250,60 @@ gen_sorted_file_list() { # Generate a list of the HTMl files, sorted by when the
sorted_file_list=$(echo "$sorted_file_list" | awk '{print $1}') # Store only the first column (the file path) in the variable sorted_file_list=$(echo "$sorted_file_list" | awk '{print $1}') # Store only the first column (the file path) in the variable
} }
gen_index_page() { # Generate an index page (site map) that includes links to the other pages gen_rss_feed() { # Uses the sorted_file_list variable to generate an RSS feed
echo "Generating RSS Feed..."
index_file_html="<nav class=\"toc\">"$'\n' # Variable to store the body HTML of the index page; enclose the list in a nav local RSS_FEED_PATH="${BASE_PATH}/output/rss.xml"
touch "$RSS_FEED_PATH" # Create the RSS file
local RSS_CONTENT="<rss version=\"2.0\">\n"
counter=0
RSS_CONTENT+="<channel>\n"
RSS_CONTENT+="<title>Two More Cents</title>\n"
RSS_CONTENT+="<link>http://twomorecents.org/</link>\n"
RSS_CONTENT+="<description>The personal website of Aadhavan Srinivasan.</description>\n"
RSS_CONTENT+="<language>en-us</language>\n"
RSS_CONTENT+="<lastBuildDate>$(date -R)</lastBuildDate>\n"
RSS_CONTENT+="<generator>s4g - Stupid Simple Static Site Generator</generator>\n"
for file in $1; do for file in $1; do
title=$(cat "$file" | grep "<title>" | head -n 1 | awk -F'[<>]' '{print $3}') # Find the title of the web page if [ $counter -gt 9 ]; then
suffix=" - Two More Cents" break
fi
RSS_CONTENT+="<item>\n"
RSS_CONTENT+="<title>\n"
RSS_CONTENT+=$(cat "$file" | grep "<title>" | head -n 1 | awk -F'[<>]' '{print $3}')$'\n'
RSS_CONTENT+="</title>\n"
RSS_CONTENT+="<link>\n"
RSS_CONTENT+="https://twomorecents.org/"
RSS_CONTENT+=$(realpath --relative-to="${BASE_PATH}/output" "$file")
RSS_CONTENT+="</link>\n"
RSS_CONTENT+="</item>\n"
((++counter))
done
RSS_CONTENT+="</channel>\n</rss>"
echo -e "$RSS_CONTENT" > $RSS_FEED_PATH
}
gen_index_page() { # Generate an index page (site map) that includes links to the other pages
echo "Generating index page..."
local index_file_html="<nav class=\"toc\">"$'\n' # Variable to store the body HTML of the index page, enclose the list in a nav
index_file_html+="<p>(All dates are in MM/DD/YYYY format)</p>"$'\n'
for file in $1; do
local title=$(cat "$file" | grep "<title>" | head -n 1 | awk -F'[<>]' '{print $3}') # Find the title of the web page
local suffix=" - Two More Cents"
title="${title%"$suffix"}" # Remove the website name from it title="${title%"$suffix"}" # Remove the website name from it
pub_date=$(cat "$file" | grep "date-published" | head -n 1 | awk -F'[<>]' '{print $3}') # Find the date published local pub_date=$(cat "$file" | grep "date-published" | head -n 1 | awk -F'[<>]' '{print $3}') # Find the date published
prefix="Published on " # Find date published of webpage prefix="Published on " # Find date published of webpage
pub_date="${pub_date#"$prefix"}" # Remove the prefix from it pub_date="${pub_date#"$prefix"}" # Remove the prefix from it
pub_date=$(echo "$pub_date" | tr -d "," | awk '{print $2" "$1" "$3}' | date -f - +"%m/%d/%Y") # Re-arrange the date and convert to mm/dd/yy pub_date=$(echo "$pub_date" | tr -d "," | awk '{print $2" "$1" "$3}' | date -f - +"%m/%d/%Y") # Re-arrange the date and convert to mm/dd/yy
file_path=$(realpath --relative-to="${BASE_PATH}/output" "$file") local file_path=$(realpath --relative-to="${BASE_PATH}/output" "$file")
index_file_html+="<li><time>${pub_date}</time> - <a href=\"$file_path\">$title</a></li>" # Add a line of HTML containing the date and title of the article index_file_html+="<li><time>${pub_date}</time> - <a href=\"$file_path\">$title</a></li>" # Add a line of HTML containing the date and title of the article
index_file_html+=$'\n' index_file_html+=$'\n'
@@ -204,7 +316,7 @@ gen_index_page() { # Generate an index page (site map) that includes links to th
echo "$index_file_html" > "$path_for_output" # Output variable to file echo "$index_file_html" > "$path_for_output" # Output variable to file
add_header_and_footer_to_index "$path_for_output" # Add header and footer to index file add_header_and_footer_to_index "$path_for_output" # Add header and footer to index file
sed -i 's/[\$][\$]title[\$][\$]/Site Map - Two More Cents/g' "$path_for_output" # Replace title variable with 'site map' title sed -i 's/[\$][\$]title[\$][\$]/Site Map/g' "$path_for_output" # Replace title variable with 'site map' title
} }
@@ -226,6 +338,7 @@ setup_output_dir
del_files_in_output del_files_in_output
md_to_html md_to_html
gen_sorted_file_list # Sets the 'sorted_file_list' variable gen_sorted_file_list # Sets the 'sorted_file_list' variable
gen_rss_feed "$sorted_file_list" # Uses the 'sorted_file_list' variable
gen_index_page "$sorted_file_list" # Uses the 'sorted_file_list' variable gen_index_page "$sorted_file_list" # Uses the 'sorted_file_list' variable
copy_things_in copy_things_in
clean_up clean_up