aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--.gitignore1
-rw-r--r--README.md85
-rwxr-xr-xssync-fetch114
-rwxr-xr-xssync-index157
-rwxr-xr-xssync-queue102
5 files changed, 459 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
index 700b956..44727af 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,3 @@
creds
ssync-cron
+ssync.conf
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..3b8c72f
--- /dev/null
+++ b/README.md
@@ -0,0 +1,85 @@
+# ssync
+
+ssync is a suite of utilities that facilitate syncing a remote device with a local one regardless of file structure. For example, if your remote storage's structure and design differ from your local one, rsync can't easily fetch everything neatly 1:1. ssync instead works by indexing all of the remote files it hasn't fetched yet, then queues them for download into a single local target dir; after that you just do whatever you want locally, separately, to reorganize things. I generally use [automv](https://git.senders.io/senders/automv) for common/repeat fetches.
+
+## setup
+
+ssync works via three main modules:
+
+- `ssync-index` - a script that indexes a directory (run separately from remote and local target)
+- `ssync-queue` - generates the queue for `-fetch` to fetch
+- `ssync-fetch` - fetches off of the queue
+
+An optional `ssync` wrapper bundles these into a single executable that runs the typical (aka "my desired") use case.
+
+The typical process is:
+
+1) have a cron execute `ssync` on the local/target system
+2) it will run `index` against the remote and local systems, refreshing the index for anything new
+3) it will then run `queue` which generates the queue of files yet-to-be-downloaded from the index, appending any files to the queue
+4) finally run `fetch` to process through the queue
+
+## logic flow
+
+- (s) establish a lock
+- (i) Generate the remote and local indexes
+- (q) Compare remote to local adding left side diff to the queue
+- (f) iterate over the queue
+ - (f) check if the file exists
+ - (f) if not download
+- (s) report queue diff, process duration, and status
+- (s) release lock
+
+### Notes on queue maintenance
+
+Rather than maintain a single persistent queue, each process generates its own queue from the index diffs. Each queue is placed in a specified directory, defaulting to `$XDG_CACHE_DIR/ssync/queue/` (if that variable is not set, `/home/$USER/.config/` is used if it exists; otherwise `/tmp/ssync/queue/` is created and used).
+
+Since each run works off of process-local indexes, the queue can be reaped between processes without incurring any potential data loss.
+
+### Index window
+
+So long as files on the local system are expected to persist longer than they do on the remote, you'll always be safe. But the index window helps set a maximum lookback, so that any older files may be removed from the local system without being resynced.
+
+In a previous implementation of this, not starting from scratch on each run led to cases where a set of sequential files had a missing file in between, like img-1, img-2, img-3, img-5, img-6 with img-4 missing. It was annoying and easy to miss.
+
+## Configuration
+
+ssync.conf
+
+```config
+remote_host=HOST
+remote_root_dir=/path/to/sync/root/
+keyfile=/path/to/key
+local_root_dir=/path/to/local/sync/root/
+
+index_window_s=86400 # 24 hours
+index_dir=/path/to/index/dir # optional
+queue_dir=/path/to/queue/dir # optional
+lock_file=/path/to/desired/file.lock # optional
+```
+
+## commands
+
+```
+ssync [options]
+ OPTIONS
+ -c [CONFIG_FILE] optional config file to use
+ default: ~/.config/ssync/ssync.conf
+ -l [LOCK_FILE] optional lock file
+ default: from config
+ -q [QUEUE_DIR] optional queue dir
+ default: from config
+ -k [KEY_FILE] optional key file
+ default: from config
+```
+
+```
+ssync-index [options] CONFIG_FILE
+ REQUIRED
+ CONFIG_FILE config file to use
+ OPTIONS
+ -l local only (cannot be used in conjunction with -r)
+ -r remote only (cannot be used in conjunction with -l)
+ -o [OUTPUT_DIR] index output dir override
+ -k [KEY_FILE] key file override
+ -v verbose logging
+ -h print this message
+```
diff --git a/ssync-fetch b/ssync-fetch
new file mode 100755
index 0000000..2a79e64
--- /dev/null
+++ b/ssync-fetch
@@ -0,0 +1,114 @@
+#!/usr/bin/env sh
+
+USAGE="ssync-fetch [options] QUEUE_FILE DEST_DIR
+ OPTIONS
+ -r REMOTE_HOST
+ remote host to download from such as user@hostname
+ username can be omitted if identical to $USER
+ or if set in ssh_config
+ -k KEY_FILE
+ ssh-key file to use (needs to be non-interactive)
+ optional: will use default session key
+ or key set in ssh_config for REMOTE_HOST
+ -v verbose logging
+ -h print this message"
+
+# HELPER FUNCTIONS
+
+function verbose_log {
+ if [ ! -z "$VERBOSE_FLAG" ]; then
+ echo "$@"
+ fi
+}
+
+# OPTIONS
+
+KEY_FILE_FLAG=
+KEY_FILE_ARG=
+REMOTE_HOST_FLAG=
+REMOTE_HOST_ARG=
+VERBOSE_FLAG=
+
+while getopts "hvr:k:" opt; do
+ case "${opt}" in
+ h) echo "$USAGE"
+ exit 1
+ ;;
+ v) VERBOSE_FLAG=1
+ ;;
+ k) KEY_FILE_FLAG=1
+ KEY_FILE_ARG="${OPTARG}"
+ ;;
+ c) CONCURRENCY_FLAG=1
+ CONCURRENCY_ARG="${OPTARG}"
+ ;;
+ r) REMOTE_HOST_FLAG=1
+ REMOTE_HOST_ARG="${OPTARG}"
+ ;;
+ esac
+done
+
+shift $(($OPTIND -1))
+
+if [ $# -ne 2 ]; then
+ echo "$USAGE"
+ exit 1
+fi
+
+QUEUE_FILE=$1
+DEST_DIR=$2
+
+# VALIDATION
+
+if [ ! -f "$QUEUE_FILE" ]; then
+ echo "Queue file '$QUEUE_FILE' does not exist"
+ exit 1
+fi
+
+if [ ! -d "$DEST_DIR" ]; then
+ echo "Destination directory '$DEST_DIR' does not exist"
+ exit 1
+fi
+
+if [ -z "$REMOTE_HOST_FLAG" ]; then
+ echo "Remote host option -r required"
+ exit 1
+elif [ -z "$REMOTE_HOST_ARG" ]; then
+ echo "Invalid remote host '$REMOTE_HOST_ARG'"
+ exit 1
+fi
+
+# CONFIGURATIONS
+
+ssh_id_param=""
+if [ ! -z "${KEY_FILE_FLAG}" ]; then
+ if [ ! -f "${KEY_FILE_ARG}" ]; then
+ echo "Identity file '${KEY_FILE_ARG}' does not exist"
+ exit 1
+ fi
+ ssh_id_param="-i ${KEY_FILE_ARG}"
+fi
+
+concurrent_param=""
+if [ ! -z "${CONCURRENCY_FLAG}" ]; then
+ if [ "${CONCURRENCY_ARG}" -gt 0 ]; then
+ concurrent_param="-X nrequests=${CONCURRENCY_ARG}"
+ fi
+fi
+real_dest=$(realpath $DEST_DIR)
+tmp_dir=$(mktemp -d /tmp/ssync_fetch_run.XXXXXX)
+ts=$(date +%s)
+verbose_log "Writing temp files to ${tmp_dir} with timestamp ${ts}"
+
+# GENERATE BATCH
+batch_file=$tmp_dir/batch_${ts}
+verbose_log "Converting the queue file to sftp batch file: ${batch_file}"
+cat $QUEUE_FILE | xargs -I{} echo "@reget {} ${real_dest}/" >> $batch_file
+
+verbose_log "Beginning download"
+verbose_log "sftp -N ${ssh_id_param} -b ${batch_file} ${REMOTE_HOST_ARG}"
+
+sftp -N ${ssh_id_param} -b ${batch_file} ${REMOTE_HOST_ARG}
+
+verbose_log "ssync-fetch finished"
+
diff --git a/ssync-index b/ssync-index
new file mode 100755
index 0000000..e19ee78
--- /dev/null
+++ b/ssync-index
@@ -0,0 +1,157 @@
+#!/usr/bin/env sh
+
+# Help text printed by -h and when no CONFIG_FILE argument is given.
+USAGE="ssync-index [options] [CONFIG_FILE]
+ OPTIONS
+ -k [KEY_FILE] override configured key file
+ -o [OUTPUT_DIR] override configured index output dir
+ -l local only
+ -r remote only
+ -v verbose logging
+ -h print this message"
+
+# HELPER METHODS
+function verbose_log {
+ if [ ! -z "$VERBOSE_FLAG" ]; then
+ echo "$1"
+ fi
+}
+function lines {
+ wc -l $1 | cut -d' ' -f1
+}
+function get_default_index_dir {
+ local cache_path=ssync/index/
+ local cache_dir=
+ if [ ! -z "$XDG_CACHE_DIR" ]; then
+ cache_dir=$XDG_CACHE_DIR/$cache_path
+ else
+ cache_dir=/tmp/
+ fi
+
+ mkdir -p $cache_dir/$cache_path
+ echo $cache_dir/$cache_path
+}
+
+# GLOBAL VALUES
+DEFAULT_INDEX_WINDOW=86400
+LOCAL_ONLY_FLAG=
+REMOTE_ONLY_FLAG=
+VERBOSE_FLAG=
+KEY_FILE_FLAG=
+KEY_FILE_ARG=
+OUTPUT_DIR_FLAG=
+OUTPUT_DIR_ARG=
+
+# Print an error message on stderr and abort with status 1.
+fail() {
+  echo "$@" >&2
+  exit 1
+}
+
+# OPTIONS PARSING
+while getopts "hvlro:k:" opt; do
+  case "${opt}" in
+    h) echo "$USAGE"
+       exit 1
+       ;;
+    o) OUTPUT_DIR_FLAG=1
+       OUTPUT_DIR_ARG="$OPTARG"
+       ;;
+    k) KEY_FILE_FLAG=1
+       KEY_FILE_ARG="$OPTARG"
+       ;;
+    l) LOCAL_ONLY_FLAG=1
+       ;;
+    r) REMOTE_ONLY_FLAG=1
+       ;;
+    v) VERBOSE_FLAG=1
+       ;;
+    *) fail "$USAGE"
+       ;;
+  esac
+done
+
+shift $((OPTIND - 1))
+
+if [ $# -eq 0 ]; then
+  echo "$USAGE"
+  exit 1
+fi
+
+# With both flags set, each one suppresses the other side and nothing would
+# be indexed at all, so reject the combination up front.
+if [ -n "$LOCAL_ONLY_FLAG" ] && [ -n "$REMOTE_ONLY_FLAG" ]; then
+  fail "Options -l and -r cannot be used together"
+fi
+
+CONFIG_FILE=$1
+
+# CONFIG PARSING
+if [ -z "$CONFIG_FILE" ]; then
+  fail "CONFIG_FILE is missing."
+fi
+
+if [ ! -f "$CONFIG_FILE" ]; then
+  fail "$CONFIG_FILE does not exist."
+fi
+
+# `source` is a bashism; POSIX `.` searches PATH for names without a slash,
+# so anchor bare names to the current directory, where the -f test found them.
+case "$CONFIG_FILE" in
+  */*) ;;
+  *) CONFIG_FILE=./$CONFIG_FILE ;;
+esac
+. "$CONFIG_FILE"
+
+# -k overrides the configured keyfile; apply it before validation so the
+# override is what gets checked and used (it was previously ignored).
+if [ -n "$KEY_FILE_FLAG" ]; then
+  keyfile=$KEY_FILE_ARG
+fi
+
+# CONFIG VALIDATION
+if [ -z "$remote_host" ]; then
+  fail "Config is missing remote_host"
+fi
+
+if [ -z "$remote_root_dir" ]; then
+  fail "Config is missing remote_root_dir"
+fi
+
+if [ -z "$keyfile" ]; then
+  fail "Config is missing keyfile"
+elif [ ! -f "$keyfile" ]; then
+  fail "Configured keyfile '$keyfile' does not exist"
+fi
+
+if [ -z "$local_root_dir" ]; then
+  fail "Config is missing local_root_dir"
+elif [ ! -d "$local_root_dir" ]; then
+  fail "Configured local_root_dir '$local_root_dir' does not exist"
+fi
+
+# VALUE SETTING
+
+if [ -z "$index_window_s" ]; then
+  index_window_s=$DEFAULT_INDEX_WINDOW
+fi
+
+local_index_file=
+remote_index_file=
+if [ -n "$OUTPUT_DIR_FLAG" ]; then
+  index_dir=$OUTPUT_DIR_ARG
+fi
+
+if [ -n "$index_dir" ]; then
+  if [ ! -d "$index_dir" ]; then
+    fail "Configured index_dir '$index_dir' does not exist"
+  fi
+else
+  index_dir=$(get_default_index_dir) \
+    || fail "Could not create the default index dir"
+fi
+
+# RUN INDEXING
+
+ts=$(date -u +%s)
+local_index_file=${index_dir}/${ts}_local.idx
+remote_index_file=${index_dir}/${ts}_remote.idx
+# NOTE: `-d "... ago"` and `-Is` are GNU date options.
+newermt=$(date -d "$index_window_s seconds ago" -u -Is) \
+  || fail "Could not compute the index window start"
+verbose_log "Targetting files from $index_window_s seconds ago: $newermt"
+
+# index remote host relative to your fetch dir
+if [ -z "$LOCAL_ONLY_FLAG" ]; then
+  verbose_log "Generating remote index from ${remote_root_dir} to ${remote_index_file}"
+  # remote_root_dir is deliberately left unquoted inside the remote command
+  # so the remote shell keeps expanding it exactly as it did before.
+  ssh -i "$keyfile" "$remote_host" \
+    "find ${remote_root_dir} -type f -newermt $newermt -exec realpath {} \;" \
+    > "$remote_index_file"
+  ssh_status=$?
+  # ssh itself exits 255 on connection/auth errors; any other non-zero value
+  # is the remote find's status, which can still come with usable output.
+  if [ "$ssh_status" -eq 255 ]; then
+    fail "Remote indexing failed: ssh to '$remote_host' exited with 255"
+  elif [ "$ssh_status" -ne 0 ]; then
+    echo "Warning: remote find exited with status $ssh_status" >&2
+  fi
+  verbose_log "Indexed $(lines "$remote_index_file") remote files"
+fi
+
+# index local filenames ONLY
+if [ -z "$REMOTE_ONLY_FLAG" ]; then
+  verbose_log "Generate local index from ${local_root_dir} to ${local_index_file}"
+  # Strip the directory part in one sed pass instead of one basename
+  # process per file.
+  find "$local_root_dir" -type f | sed 's|.*/||' > "$local_index_file"
+  verbose_log "Indexed $(lines "$local_index_file") local files"
+fi
diff --git a/ssync-queue b/ssync-queue
new file mode 100755
index 0000000..4f98731
--- /dev/null
+++ b/ssync-queue
@@ -0,0 +1,102 @@
+#!/usr/bin/env sh
+
+USAGE="ssync-queue [options] -l LOCAL_INDEX_FILE -r REMOTE_INDEX_FILE -q QUEUE_OUTPUT_FILE
+ OPTIONS
+ -l LOCAL_INDEX_FILE
+ target local index file
+ -r REMOTE_INDEX_FILE
+ target remote index file
+ -q QUEUE_OUTPUT_FILE
+ queue output file
+ -v verbose logging
+ -h print this message"
+
+# HELPER FUNCTIONS
+
+function verbose_log {
+ if [ ! -z "$VERBOSE_FLAG" ]; then
+ echo "$@"
+ fi
+}
+function lines {
+ echo $(wc -l $1 | cut -d' ' -f1)
+}
+
+# OPTIONS
+
+VERBOSE_FLAG=
+LOCAL_FILE_FLAG=
+REMOTE_FILE_FLAG=
+QUEUE_FILE_FLAG=
+CONFIG_FILE_FLAG=
+LOCAL_FILE_ARG=
+REMOTE_FILE_ARG=
+QUEUE_FILE_ARG=
+
+# Print an error message on stderr and abort with status 1.
+fail() {
+  echo "$@" >&2
+  exit 1
+}
+
+while getopts "hvl:r:q:c:" opt; do
+  case "${opt}" in
+    l) LOCAL_FILE_FLAG=1
+       LOCAL_FILE_ARG="${OPTARG}"
+       ;;
+    r) REMOTE_FILE_FLAG=1
+       REMOTE_FILE_ARG="${OPTARG}"
+       ;;
+    q) QUEUE_FILE_FLAG=1
+       QUEUE_FILE_ARG="${OPTARG}"
+       ;;
+    c) CONFIG_FILE_FLAG=1 # still accepted for compatibility; value unused
+       ;;
+    v) VERBOSE_FLAG=1
+       ;;
+    h) echo "$USAGE"
+       exit 1
+       ;;
+    *) fail "$USAGE"
+       ;;
+  esac
+done
+
+shift $((OPTIND - 1))
+
+if [ -z "$LOCAL_FILE_FLAG" ]; then
+  fail "-l LOCAL_INDEX_FILE option is required"
+elif [ ! -f "$LOCAL_FILE_ARG" ]; then
+  fail "local index file '$LOCAL_FILE_ARG' does not exist"
+fi
+
+if [ -z "$REMOTE_FILE_FLAG" ]; then
+  fail "-r REMOTE_INDEX_FILE option is required"
+elif [ ! -f "$REMOTE_FILE_ARG" ]; then
+  fail "remote index file '${REMOTE_FILE_ARG}' does not exist"
+fi
+
+if [ -z "$QUEUE_FILE_FLAG" ]; then
+  fail "-q QUEUE_OUTPUT_FILE option is required"
+fi
+
+queue_tmp_dir=$(mktemp -d /tmp/ssync_queue_run.XXXXXX) \
+  || fail "Cannot create temp directory"
+# The intermediate files are only needed for this run; do not leave them.
+trap 'rm -rf -- "$queue_tmp_dir"' EXIT
+verbose_log "Writing temp files to $queue_tmp_dir"
+
+# get remote filenames
+# sed strips everything up to the last slash (blank lines are dropped);
+# unlike `xargs basename` it leaves quotes and backslashes in names alone.
+remote_index_filenames_file=$queue_tmp_dir/remote_filenames.idx
+verbose_log "Writing remote index filenames to $remote_index_filenames_file"
+sed -e '/^$/d' -e 's|.*/||' "$REMOTE_FILE_ARG" > "$remote_index_filenames_file"
+
+# Plain temp files replace <(...) process substitution, which POSIX sh
+# cannot parse.
+remote_unique_filenames_file=$queue_tmp_dir/remote_unique_filenames.idx
+sort -u "$remote_index_filenames_file" > "$remote_unique_filenames_file"
+original_line_count=$(lines "$REMOTE_FILE_ARG")
+unique_line_count=$(lines "$remote_unique_filenames_file")
+verbose_log "Remote index contains $unique_line_count unique filenames out of $original_line_count indexed files"
+
+if [ "$original_line_count" != "$unique_line_count" ]; then
+  echo "Remote index contains non-unique files. Check $REMOTE_FILE_ARG to find which files aren't unique" >&2
+fi
+
+# find which filenames are unique to the remote
+sorted_local_file=$queue_tmp_dir/local_sorted.idx
+sort "$LOCAL_FILE_ARG" > "$sorted_local_file"
+remote_only_filenames_file=$queue_tmp_dir/remote_only_filenames.idx
+comm -23 "$remote_unique_filenames_file" "$sorted_local_file" \
+  > "$remote_only_filenames_file"
+verbose_log "Found $(lines "$remote_only_filenames_file") remote only files"
+
+# push matching files into queue
+queue_before=0
+if [ -f "$QUEUE_FILE_ARG" ]; then
+  queue_before=$(lines "$QUEUE_FILE_ARG")
+fi
+# Make sure the queue file exists even when nothing is appended.
+: >> "$QUEUE_FILE_ARG"
+if [ -s "$remote_only_filenames_file" ]; then
+  # Load the wanted names, then print each remote index line whose last
+  # path component is exactly one of them. The old per-name grep treated
+  # names as regexes and matched any line merely ending in the name.
+  awk -F/ 'NR == FNR { wanted[$0] = 1; next } ($NF in wanted)' \
+    "$remote_only_filenames_file" - < "$REMOTE_FILE_ARG" >> "$QUEUE_FILE_ARG"
+fi
+queue_after=$(lines "$QUEUE_FILE_ARG")
+# Report what this run appended, not the total size of the queue file.
+verbose_log "Added $((${queue_after:-0} - ${queue_before:-0})) to the queue"
+