#!/bin/bash
#  file - file and directory support library

set -e

lib_load 'core/qsort'


######
# Library Settings

# core_file_config_init() - Declares this library's settings and assigns
# their default values.
# NOTE(review): ``lib_setting_vars`` / ``lib_setting_arrays`` are defined
# outside this file; presumably they register the named variables as
# library settings -- confirm against their definitions.
core_file_config_init() {
	# $file_find_opts[] - Passed to find(1) when called by file_find.
	lib_setting_arrays file_find_opts

	# $file_find_sort - If set to `true`, output from ``file_find()``
	# will be sorted.  The value is executed as a command by
	# ``file_find()``, so it must be `true` or `false`.
	lib_setting_vars file_find_sort
	file_find_sort=false

	# $file_hashbin - Name of default hashing program
	# (run by ``file_hash_raw()``).
	lib_setting_vars file_hashbin
	file_hashbin=sha1sum

	# $file_hash_texts[] - Hashing algorithm output (see ``file_hash()``).
	# $file_hash_names[] - Hashed file names (see ``file_hash()``).
	# $file_hash_sizes[] - Hashed file sizes (see ``file_hash()``).
	# $file_hash_modes[] - Hashed file modes (see ``file_hash()``).
	# The four arrays are filled by ``_file_hash_split()``, one entry
	# per hashed file.
	lib_setting_arrays \
		file_hash_texts file_hash_names \
		file_hash_sizes file_hash_modes
}


######
# File I/O

# file_read() - Reads file ($2) into an array variable ($1), one element
# per line, with the trailing newline of each line removed.
#
# $1 - Name of the array variable to fill
# $2 - Name of the file to read
#
# No local variables are declared on purpose: ``readarray`` assigns to the
# variable *named* by $1, and a local of the same name (for example an
# array called ``name`` or ``var``) would shadow the caller's array and
# silently swallow the result.
file_read() {
	readarray -t "$1" <"$2"
}

# file_read_func() - Loads the lines of a file ($1) and invokes a
# command ($@), appending every line as one extra argument.
#
# $1 - File name
# $@ - Command (and leading arguments) to receive the lines
file_read_func() {
	local src_file=$1
	shift

	# One array element per line of the file.
	local -a contents=()
	file_read contents "$src_file"

	"$@" "${contents[@]}"
}

# file_capture_func() - Capture the output of a command ($@) to a
# temporary file, storing the name of the file in a named variable ($1).
#
# $1 - Name of variable to receive the name of the temporary file.
# $2 - Command to execute
# $@ - Command arguments (optional)
#
# The temporary file is obtained from ``cmd_tempfile``; only the command's
# standard output is captured.  The function returns the status of the
# command.
file_capture_func() {
	min_args 2 "$@"
	# The nameref carries a function-specific name: a generic one such as
	# ``name`` becomes a circular reference as soon as a caller passes a
	# variable of that same name.
	local -n __file_capture_ref=$1
	shift

	__file_capture_ref=$(cmd_tempfile)
	[ "$__file_capture_ref" ] || error "unable to create temporary file"

	"$@" >"$__file_capture_ref"
}


# file_capture() - Captures `stdin` to a temporary file, storing
# the name of the file in a named variable ($1).
# $1 - Name of variable to receive the name of the temporary file.
#
# NOTE(review): ``has_args`` is defined outside this file; presumably it
# enforces that exactly one argument was given -- confirm.
file_capture() {
	has_args 1 "$@"
	# ``cat`` without operands copies stdin to stdout, which
	# file_capture_func redirects into the temporary file.
	file_capture_func "$1" cat
}


######
# File Variable I/O

# file_var_func() - Captures the output of a command ($@) and
# assigns its output to the given variable ($1).
# $1 - Name of variable to receive output from the command.
# $2 - Command to execute
# $@ - Command arguments (optional)
#
# The output is collected line by line through ``file_array_func()`` and
# the lines are joined with the first character of $IFS (a space by
# default) before being stored.
file_var_func() {
	min_args 2 "$@"
	# Function-specific names keep the nameref and the scratch array from
	# colliding with a caller variable called ``var`` or ``lines``.
	local -n __file_var_ref=$1
	shift

	local -a __file_var_lines=()
	# Forward the command as separate words ("$@"); the previous spelling
	# "$"@ passed the literal string '$@' as the command to run.
	file_array_func __file_var_lines "$@"
	__file_var_ref="${__file_var_lines[*]}"
}

# file_array_func() - Captures the output of a command ($@) and
# assigns its output to the given array variable ($1).
# $1 - Name of array variable to receive lines from the command.
# $2 - Command to execute
# $@ - Command arguments (optional)
#
# The output goes through a temporary file (``file_capture_func()``) and
# is then loaded with ``file_read()``, one array element per line.
file_array_func() {
	min_args 2 "$@"
	# Function-specific local names: a caller array called ``var`` or
	# ``str`` would otherwise be shadowed by these locals and never filled.
	local __file_array_var=$1
	shift

	local __file_array_tmp
	file_capture_func __file_array_tmp "$@"
	file_read "$__file_array_var" "$__file_array_tmp"
}


######
# Directory and Path Functions

# run_mkdir() - Creates directories ($@) with ``mkdir -p`` via ``run``.
# Prefer this helper over calling ``mkdir`` directly.
#
# ``$debug`` is executed as a command (`true`/`false`); when it succeeds
# ``-v`` is added to the mkdir options.  Everything mkdir prints on
# stdout is redirected to stderr.
run_mkdir() {
	local verbose_flag=
	if $debug; then
		verbose_flag=-v
	fi
	# $verbose_flag is intentionally unquoted so that it vanishes when empty.
	run mkdir -p $verbose_flag "$@" >&2
}

# run_pushd() - Changes to directory $1 with ``pushd`` (through ``run``),
# discarding pushd's stdout; reports "<dir>: pushd failed" through
# ``error`` when the directory change does not succeed.
run_pushd() {
	if ! run pushd "$1" >/dev/null; then
		error "$1: pushd failed"
	fi
}

# run_popd() - Returns to the previous directory with ``popd`` (through
# ``run``), discarding the directory stack that popd prints on stdout.
run_popd() {
	run popd >/dev/null
}

# run_in_dir() - Runs the command ($@) after changing to a new
# directory ($1), then returns to the previous directory.
# $1 - New directory
# $@ - Command to run
#
# Returns the exit status of the command.  If the directory change fails
# the command is not run and the failing status is returned.
#
# The statuses are read from $? on the following line rather than with
# ``cmd || rc=$?``: the ``||`` form would switch off ``set -e`` inside
# the command being run.  With ``set -e`` active a failure still aborts
# at the failing call; when errexit is suppressed (for example when this
# function is used as an ``if`` condition) the status is preserved
# instead of being replaced by the status of ``run_popd``.
run_in_dir() {
	min_args 2  "$@"
	local dir=$1
	shift

	run_pushd "$dir"
	local __run_in_dir_rc=$?
	[ "$__run_in_dir_rc" -eq 0 ] || return "$__run_in_dir_rc"

	run "$@"
	__run_in_dir_rc=$?

	run_popd
	return "$__run_in_dir_rc"
}

# file_readdir() - Prints the base names of the entries found directly
# inside a directory ($1), one per line, using find(1).
# $1 - Directory to search
# $@ - Optional extra find(1) expressions, inserted before the action
file_readdir() {
	local search_root=$1
	shift

	# -mindepth/-maxdepth 1 limit the result to direct children and leave
	# out the directory itself.
	run find "$search_root" \
		-mindepth 1 -maxdepth 1 \
		"$@" \
		-exec basename {} \;
}

# short_path() - Prints an abbreviated form of a path ($1): the name of
# the directory containing it and its base name, prefixed with ".../".
# $1 - Path to abbreviate
short_path() {
	local full=$1
	local parent leaf prefix

	parent=$(dirname "$full")
	parent=$(realpath "$parent")
	leaf=$(basename "$full")

	# NOTE(review): realpath prints an absolute path, so this comparison
	# looks like it can never be false -- confirm whether the '.' case
	# was meant to be tested before calling realpath.
	if [ "$parent" != '.' ]; then
		parent=$(basename "$parent")
		prefix=".../"
	fi

	echo "$prefix$parent/$leaf"
}


# file_mkdir() - Ensures that the directory which would contain the
# file ($1) exists, creating it through ``run_mkdir()`` when missing.
# $1 - File name whose directory portion is needed
file_mkdir() {
	has_args 1 "$@"

	local parent
	parent=$(dirname "$1")

	if [ ! -d "$parent" ]; then
		run_mkdir "$parent"
	fi
}


######
# Finding Files

# file_find - Fills an array with names of things found by ``find(1)``.
# The ``$file_find_opts[]`` variable controls the behavior of this function,
# allowing additional arguments to be passed to ``find``.
# $1 - Name of array variable
# $@ - Path names to search
#
# ``find`` writes one name per line into a temporary file obtained from
# ``cmd_tempfile``; that file is then loaded into the array, so names
# containing newlines are split across elements.  When
# ``$file_find_sort`` (run as a command) succeeds, the array is passed to
# ``qsort_list``.  Note that ``find`` is invoked directly, not through
# ``run``.
file_find() {
	min_args 2 "$@"
	# Function-specific local names: a caller array called ``var`` or
	# ``fftmp`` would otherwise be shadowed by these locals and stay empty.
	local __file_find_var=$1
	shift

	local __file_find_tmp
	__file_find_tmp=$(cmd_tempfile)
	find "$@" "${file_find_opts[@]}" -print >"$__file_find_tmp"

	readarray -t "$__file_find_var" <"$__file_find_tmp"

	! $file_find_sort || qsort_list "$__file_find_var"
}


######
# File Checking

# file_size() - Prints the size in bytes of the given file(s), one size
# per line, using ``stat -c '%s'``.
# $@ - File names
#
# ``--`` ends option parsing so that a file name starting with ``-`` is
# treated as a file and not as a stat option.
file_size() { stat -c '%s' -- "$@"; }

# file_test() - Performs an existential test ($1) on each of the given
# files ($@) with the ``test`` builtin, calling ``error`` with
# "<file>: not found" for every file that does not pass.
# $1 - Test operator, e.g. ``-f`` or ``-d``
# $@ - File names to check
file_test() {
	min_args 2 "$@"

	local op=$1
	shift

	local path
	for path; do
		# $op is deliberately left unquoted, exactly as it is handed
		# to ``test``.
		if ! [ $op "$path" ]; then
			error "$path: not found"
		fi
	done
}

# file_is_terminal() - Returns success if the named fd ($1) is a terminal.
# $1 - File descriptor number, or one of the names ``stdin``, ``stdout``
#      or ``stderr`` (mapped to 0, 1 and 2)
file_is_terminal() {
	has_args 1 "$@"
	local fd=$1
	case "$fd" in
	(stdin) fd=0 ;;
	(stdout) fd=1 ;;
	(stderr) fd=2 ;;
	esac
	# Quoted on purpose: with an empty value an unquoted $fd collapses the
	# test to ``[ -t ]``, which merely checks that the string "-t" is
	# non-empty and therefore always succeeds.
	[ -t "$fd" ]
}


######
# File Hashing

# file_hash_raw() - Runs the ``$file_hashbin`` program (through ``run``)
# on the given files ($@); the program's output is left untouched.
# $@ - File names (at least one)
file_hash_raw() {
	min_args 1 "$@"
	# $file_hashbin is left unquoted, so it undergoes word splitting.
	run $file_hashbin "$@"
}

# file_hash_list() - Runs ``file_hash_raw()`` on the given files ($@)
# and reads the results in the named array variable ($1).
# $1 - Name of array variable to receive one output line per element
# $@ - File names to hash
#
# The output goes through a temporary file obtained from
# ``cmd_tempfile``.  The array name is used straight from $1 and the only
# local carries a function-specific name: a caller array called ``var``
# or ``tmp`` would otherwise be shadowed by a local and never filled.
file_hash_list() {
	local __file_hash_tmp
	__file_hash_tmp=$(cmd_tempfile)
	file_hash_raw "${@:2}" >"$__file_hash_tmp"

	readarray -t "$1" <"$__file_hash_tmp"
}

# file_hash() - Hashes the given files ($@) and records the outcome in
# the ``file_hash_*`` arrays: every line produced by
# ``file_hash_list()`` is handed to ``_file_hash_split()``.
# $@ - File names (at least one)
file_hash() {
	min_args 1 "$@"

	local -a hash_lines
	file_hash_list hash_lines "$@"

	for_each _file_hash_split "${hash_lines[@]}"
}

# _file_hash_split() - Breaks one line of hashing output ($1) into its
# parts and appends them to the ``$file_hash_*[]`` arrays.
# $1 - Line of the form "<hash> <mode char><file name>"
#
# Everything before the first space is the hash text; the character right
# after that space is the mode and the remainder is the file name.  The
# size is looked up with ``file_size()``.
_file_hash_split() {
	local record=$1

	local digest=${record%% *}
	local rest=${record#* }

	# First character of the remainder is the mode, the tail is the name.
	local flag=${rest:0:1}
	local fname=${rest:1}

	local nbytes
	nbytes=$(file_size "$fname")

	list_append file_hash_names "$fname"
	list_append file_hash_modes "$flag"
	list_append file_hash_texts "$digest"
	list_append file_hash_sizes "$nbytes"
}

# file_hash_print() - Prints every stored hash record by calling
# ``_file_hash_print_index()`` (through ``for_each``) with the indexes
# 0 .. N-1, where N is the number of entries in ``$file_hash_names[]``.
# This function exists primarily for verification purposes.
file_hash_print() {
	local -a indexes=()
	local n
	for ((n = 0; n < ${#file_hash_names[@]}; n++)); do
		indexes+=("$n")
	done

	for_each _file_hash_print_index "${indexes[@]}"
}

# _file_hash_print_index() - Prints the stored hash record of one file
# in the form "<hash> <mode><name> (<size>)".
# $1 - Index into ``$file_hash_*[]`` arrays
_file_hash_print_index() {
	local idx=$1
	echo "${file_hash_texts[$idx]} ${file_hash_modes[$idx]}${file_hash_names[$idx]} (${file_hash_sizes[$idx]})"
}


######
# Duplicate Detection

# file_duplicate_scan() - Scans the provided paths ($@) for duplicate files.
# $@ - Path names to search (at least one)
#
# Files are collected with ``file_find()``, hashed with ``file_hash()``
# and grouped by identical hash text.  For every hash that occurs more
# than once, ``_file_duplicate_scan_check()`` prints one line "-<name>"
# for the first file seen with that hash, followed by "+<name>" lines for
# the other files sharing it.
#
# The locals ``fd_files``, ``fd_hashmap`` and ``fd_dups`` are read and
# written by the ``_file_duplicate_scan_*`` helpers through dynamic
# scoping.
#
# NOTE(review): the helpers index ``$file_hash_*[]`` with 0 .. number of
# found files - 1, which assumes those arrays held no earlier entries
# when this function was called -- confirm.
file_duplicate_scan() {
	min_args 1 "$@"

	info "duplicate scan started..."

	local -a fd_files
	# Local override of the library setting: only consider regular files.
	local file_find_opts=( -type f )
	file_find fd_files "$@"

	info "hashing ${#fd_files[@]} files..."

	file_hash "${fd_files[@]}"
	# When $pretend (run as a command) succeeds, ``error`` is called here.
	! $pretend || error "file duplicate detection"

	# fd_hashmap: hash text -> first file name seen with that hash.
	# fd_dups:    later file name -> hash text it shares with an earlier file.
	local -A fd_hashmap
	local -A fd_dups
	for_each _file_duplicate_scan_index $(seq 0 $((${#fd_files[@]} - 1)))

	info "printing list of duplicates..."

	# The values of fd_dups are hash texts; presumably ``list_unique``
	# removes repeated hashes and ``qsort_list`` orders them (both are
	# defined outside this file -- confirm).
	local -a dups=( "${fd_dups[@]}" )
	list_unique dups
	qsort_list dups

	for_each _file_duplicate_scan_check "${dups[@]}"

	info "duplicate scan... done."
}

# _file_duplicate_scan_index() - Classifies one hashed file as either the
# first file seen with its hash or a duplicate of an earlier one.
# $1 - Index into ``$file_hash_*[]`` arrays
#
# Uses the associative arrays ``fd_hashmap`` (hash text -> first file
# name) and ``fd_dups`` (duplicate file name -> hash text) found in the
# caller's scope.
_file_duplicate_scan_index() {
	local idx=$1
	local digest=${file_hash_texts[$idx]}
	local fname=${file_hash_names[$idx]}

	# First time this hash shows up: remember the file as the original.
	if [ -z "${fd_hashmap[$digest]}" ]; then
		fd_hashmap[$digest]=$fname
		return
	fi

	fd_dups[$fname]=$digest
}

# _file_duplicate_scan_check() - Prints the group of duplicate files that
# share one hash text ($1).
# The first of a group of duplicate files is prefixed with ``-``;  the
# remaining duplicates in the group are prefixed with ``+``.
# $1 - Hash text identifying the group
#
# Reads ``fd_hashmap`` and ``fd_files`` from the caller's scope and the
# ``$file_hash_texts[]`` / ``$file_hash_names[]`` arrays.
_file_duplicate_scan_check() {
	local dup_text=$1
	local dup_name=${fd_hashmap[$dup_text]}
	# printf rather than echo: for a file called e.g. ``n`` or ``e``,
	# ``echo "-n"`` would be taken as an echo option and print nothing.
	printf '%s\n' "-$dup_name"

	# The loop variable is local so that it cannot clobber a variable
	# ``i`` owned by a caller (dynamic scoping).
	local i text name
	for ((i = 0; i < ${#fd_files[@]}; i++)); do
		text=${file_hash_texts[$i]}
		[[ "$text" == "$dup_text" ]] || continue
		name=${file_hash_names[$i]}
		[[ "$name" != "$dup_name" ]] || continue
		printf '%s\n' "+$name"
	done
}

# file_duplicate_scan_list() - Calls ``file_duplicate_scan "$@"`` and
# stores the results in an array variable ($1).
# $1 - Name of array variable to receive one output line per element
# $@ - Path names to scan
#
# The output goes through a temporary file obtained from
# ``cmd_tempfile``.  The array name is used straight from $1 and the only
# local carries a function-specific name: a caller array called ``var``
# or ``tmp`` would otherwise be shadowed by a local and never filled.
file_duplicate_scan_list() {
	local __file_dup_tmp
	__file_dup_tmp=$(cmd_tempfile)
	file_duplicate_scan "${@:2}" >"$__file_dup_tmp"

	readarray -t "$1" <"$__file_dup_tmp"
}

# for_each_file_duplicate() - Walks a duplicate list as produced by
# ``file_duplicate_scan_list()`` and calls a function ($1) once per
# duplicate with two arguments: the original file name and the duplicate
# file name.
# $1 - Function to call
# $@ - List items: "-<name>" starts a group (the original), "+<name>" is
#      a duplicate of the most recent original
#
# Any item with another first character is reported through ``error``.
for_each_file_duplicate() {
	local callback=$1
	shift

	local entry original duplicate
	for entry; do
		case "${entry:0:1}" in
		-)
			original=${entry:1}
			;;
		+)
			duplicate=${entry:1}
			# $callback is left unquoted, so it undergoes word splitting.
			$callback "$original" "$duplicate"
			;;
		*)
			error "$entry: unknown duplicate item"
			;;
		esac
	done
}


######
# Duplicate Elimination

# file_duplicate_symlink() - Finds all duplicates and replaces them with
# symbolic links by running ``file_duplicate_link()`` with the link type
# set to ``soft``.
# $@ - Passed through unchanged to ``file_duplicate_link()``
file_duplicate_symlink() {
	# Visible to file_duplicate_link() through dynamic scoping only for
	# the duration of this call.
	local file_duplicate_link_type
	file_duplicate_link_type=soft

	file_duplicate_link "$@"
}

# file_duplicate_link() - Finds all duplicates and eliminates them
# by replacing them with links.
# $1 - Confirmation function; called with the duplicate list, a failing
#      status aborts through ``error``
# $@ - Path names to scan
#
# ``$file_duplicate_link_type`` selects the kind of link: ``hard`` (the
# default when unset or empty) or ``soft``.
file_duplicate_link() {
	local confirm_func=$1
	shift

	# The link type is validated before the scan so that an unknown type
	# is reported without first hashing every file.
	local file_duplicate_link_type=${file_duplicate_link_type:-hard}
	# Local, and reset on every call: as a global, the ``-s`` left behind
	# by a symlink run would turn a later hard-link run into a symlink
	# run.  _file_duplicate_link_item() still sees it through dynamic
	# scoping.
	local file_duplicate_link_opt=
	case "$file_duplicate_link_type" in
	hard) ;;
	soft) file_duplicate_link_opt="-s" ;;
	*) error "$file_duplicate_link_type: unknown link type" ;;
	esac

	local -a dups
	file_duplicate_scan_list dups "$@"

	# NOTE(review): $__FUNCNAME__ is not set anywhere in this file;
	# presumably it is provided by the surrounding library -- confirm.
	$confirm_func "${dups[@]}" || error "$__FUNCNAME__: aborted"

	for_each_file_duplicate _file_duplicate_link_item "${dups[@]}"
}

# _file_duplicate_link_item() - Replaces one duplicate ($2) with a link
# to the original file ($1).
# $1 - Original file name (link target)
# $2 - Duplicate file name (removed and recreated as a link)
#
# ``$file_duplicate_link_opt`` (set by ``file_duplicate_link()``) holds
# the extra ``ln`` option: empty for hard links, ``-s`` for symbolic
# links.
_file_duplicate_link_item() {
	local src=$1
	local dst=$2
	info "linking: '$src' -> '$dst'"

	# A symbolic link stores its target as text, and a relative target is
	# resolved against the directory of the link, not the current
	# directory.  A relative original is therefore made absolute first;
	# otherwise e.g. ``ln -s ./a/x ./b/y`` leaves a dangling link.
	local target=$src
	if [[ "${file_duplicate_link_opt}" == "-s" && "$src" != /* ]]; then
		target=$(realpath "$src")
	fi

	run rm "$dst"
	# Unquoted on purpose so that an empty option disappears.
	run ln ${file_duplicate_link_opt} "$target" "$dst"
}

# file_duplicate_delete() - Finds duplicates below the given paths and
# removes them, keeping the first file of every group.
# $1 - Confirmation function; called with the duplicate list, a failing
#      status is reported through ``error``
# $@ - Path names to scan
file_duplicate_delete() {
	local confirm_cb=$1
	shift

	local -a dup_items
	file_duplicate_scan_list dup_items "$@"

	# $confirm_cb is left unquoted, so it undergoes word splitting.
	if ! $confirm_cb "${dup_items[@]}"; then
		error "$__FUNCNAME__: aborted"
	fi

	for_each_file_duplicate _file_duplicate_delete_item "${dup_items[@]}"
}

# _file_duplicate_delete_item() - Removes one duplicate file.
# $1 - Original file name (unused; the original is kept)
# $2 - Duplicate file name to remove
_file_duplicate_delete_item() {
	run rm "$2"
}
# View the Developer Guide Index
#
# View the Reference Manual Index