#!/usr/bin/env bash
# Pull deployed data directories over SSH into a local mirror, then optionally
# ingest into AnythingLLM.
#
# Source of truth:
#   projects/<id>/conf.json -> smart_ide.remote_data_access.environments.<env>
#
# This script never writes to remote databases. It only performs SSH/rsync reads.
set -euo pipefail

# Print the command-line help to stdout.
usage() {
  cat <<'EOF'
Usage:
  ./scripts/remote-data-ssh-sync.sh [--project <id>] [--env <test|pprod|prod>] [--mirror-root <path>] [--roles <role1,role2>]
    [--no-anythingllm] [--max-files <n>] [--max-bytes <n>] [--dry-run]

Project/env resolution (first match):
  - --project / --env
  - SMART_IDE_PROJECT_ID / SMART_IDE_ENV
  - projects/active-project.json (local, gitignored)

Mirror root:
  - SMART_IDE_REMOTE_DATA_MIRROR_ROOT, else <repo>/.data/remote-data

AnythingLLM ingestion:
  - enabled by default (skip if AnythingLLM config or workspace slug is missing)
  - reads workspace slug from projects/<id>/conf.json -> smart_ide.anythingllm_workspace_slug[env]
  - reads ANYTHINGLLM_BASE_URL / ANYTHINGLLM_API_KEY from ~/.config/4nk/anythingllm-sync.env if present
  - uses scripts/anythingllm-pull-sync/sync.mjs in --upload-all mode

Notes:
  - Requires: jq, ssh, rsync, node (>=20).
  - Output is not filtered; rsync output remains visible.
EOF
}

# Print an error message on stderr and abort with exit status 1.
die() {
  echo "[remote-data-ssh-sync][ERROR] $*" >&2
  exit 1
}

# Abort with a usage error (exit 2) when a value-taking option is the last
# argument. Without this check `shift 2` fails under `set -e` and the script
# would exit silently.
# Arguments: $1 - option name, $2 - number of remaining arguments (option included)
require_value() {
  if (( $2 < 2 )); then
    echo "[remote-data-ssh-sync][ERROR] Missing value for $1" >&2
    usage >&2
    exit 2
  fi
}

ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
ACTIVE_PROJECT_FILE="${ROOT}/projects/active-project.json"

PROJECT_ID="${SMART_IDE_PROJECT_ID:-}"
ENV_NAME="${SMART_IDE_ENV:-}"
MIRROR_ROOT="${SMART_IDE_REMOTE_DATA_MIRROR_ROOT:-${ROOT}/.data/remote-data}"
INGEST_ANYTHINGLLM="true"
ROLES_CSV=""
DRY_RUN="false"
MAX_FILES="${ANYTHINGLLM_SYNC_MAX_FILES:-200}"
MAX_BYTES="${ANYTHINGLLM_SYNC_MAX_FILE_BYTES:-5242880}"

while [[ $# -gt 0 ]]; do
  case "$1" in
    -h|--help)
      usage
      exit 0
      ;;
    --project)
      require_value "$1" "$#"
      PROJECT_ID="$2"
      shift 2
      ;;
    --env)
      require_value "$1" "$#"
      ENV_NAME="$2"
      shift 2
      ;;
    --mirror-root)
      require_value "$1" "$#"
      MIRROR_ROOT="$2"
      shift 2
      ;;
    --roles)
      require_value "$1" "$#"
      ROLES_CSV="$2"
      shift 2
      ;;
    --no-anythingllm)
      INGEST_ANYTHINGLLM="false"
      shift 1
      ;;
    --max-files)
      require_value "$1" "$#"
      MAX_FILES="$2"
      shift 2
      ;;
    --max-bytes)
      require_value "$1" "$#"
      MAX_BYTES="$2"
      shift 2
      ;;
    --dry-run)
      DRY_RUN="true"
      shift 1
      ;;
    *)
      echo "[remote-data-ssh-sync][ERROR] Unknown arg: $1" >&2
      usage >&2
      exit 2
      ;;
  esac
done

for dep in jq ssh rsync node; do
  command -v "${dep}" >/dev/null 2>&1 || die "Missing dependency: ${dep}"
done

# Fall back to the local active-project file for anything not given explicitly.
if [[ -z "${PROJECT_ID}" && -f "${ACTIVE_PROJECT_FILE}" ]]; then
  PROJECT_ID="$(jq -r '.id // empty' "${ACTIVE_PROJECT_FILE}")"
fi
if [[ -z "${ENV_NAME}" && -f "${ACTIVE_PROJECT_FILE}" ]]; then
  ENV_NAME="$(jq -r '.default_env // empty' "${ACTIVE_PROJECT_FILE}")"
fi

if [[ -z "${PROJECT_ID}" ]]; then
  die "Missing project id. Provide --project or create projects/active-project.json"
fi
if [[ -z "${ENV_NAME}" ]]; then
  ENV_NAME="test"
fi

case "${ENV_NAME}" in
  test|pprod|prod) ;;
  *)
    die "Invalid --env: ${ENV_NAME} (expected test|pprod|prod)"
    ;;
esac

CONF_FILE="${ROOT}/projects/${PROJECT_ID}/conf.json"
if [[ ! -f "${CONF_FILE}" ]]; then
  die "Missing project conf: ${CONF_FILE}"
fi

# The environment name is passed as a jq variable rather than spliced into the
# jq program text.
SSH_HOST_ALIAS="$(
  jq -r --arg env "${ENV_NAME}" \
    '.smart_ide.remote_data_access.environments[$env].ssh_host_alias // empty' \
    "${CONF_FILE}"
)"
if [[ -z "${SSH_HOST_ALIAS}" || "${SSH_HOST_ALIAS}" == "null" ]]; then
  die "Missing ssh_host_alias for ${PROJECT_ID}/${ENV_NAME} in ${CONF_FILE}"
fi

# One compact JSON document per configured remote directory.
mapfile -t ITEMS < <(
  jq -c --arg env "${ENV_NAME}" \
    '.smart_ide.remote_data_access.environments[$env].remote_data_directories[]? // empty' \
    "${CONF_FILE}"
)
if [[ ${#ITEMS[@]} -eq 0 ]]; then
  echo "[remote-data-ssh-sync] No remote_data_directories configured for ${PROJECT_ID}/${ENV_NAME} (nothing to do)."
  exit 0
fi

MIRROR_ROOT_ABS="${MIRROR_ROOT}"
if [[ "${MIRROR_ROOT_ABS}" != /* ]]; then
  # Relative mirror roots are resolved against the repository root.
  MIRROR_ROOT_ABS="$(cd "${ROOT}" && realpath -m "${MIRROR_ROOT_ABS}" 2>/dev/null || echo "${ROOT}/${MIRROR_ROOT_ABS}")"
fi

base_dir="${MIRROR_ROOT_ABS}/${PROJECT_ID}/${ENV_NAME}"
mkdir -p "${base_dir}"

# Per-item manifest entries are accumulated here (one JSON object per line).
tmp_items="$(mktemp -t remote-data-items.XXXXXX)"
cleanup() {
  rm -f "${tmp_items}"
}
trap cleanup EXIT

echo "[remote-data-ssh-sync] projectId=${PROJECT_ID}"
echo "[remote-data-ssh-sync] env=${ENV_NAME}"
echo "[remote-data-ssh-sync] sshHostAlias=${SSH_HOST_ALIAS}"
echo "[remote-data-ssh-sync] mirrorRoot=${MIRROR_ROOT_ABS}"
echo "[remote-data-ssh-sync] items=${#ITEMS[@]}"
echo

# Tell whether a role is selected by --roles.
# Globals:   ROLES_CSV (read)
# Arguments: $1 - role name
# Returns:   0 when ROLES_CSV is empty or lists the role (whitespace inside
#            each CSV element is ignored), 1 otherwise
should_ingest_role() {
  local role="$1"
  if [[ -z "${ROLES_CSV}" ]]; then
    return 0
  fi
  local -a allowed=()
  local candidate
  IFS=, read -r -a allowed <<<"${ROLES_CSV}"
  for candidate in "${allowed[@]}"; do
    if [[ "${candidate//[[:space:]]/}" == "${role}" ]]; then
      return 0
    fi
  done
  return 1
}

# Captured before any transfer so the manifest records when the run began.
started_at="$(date -Iseconds)"

for item in "${ITEMS[@]}"; do
  role="$(jq -r '.role // empty' <<<"${item}")"
  remote_path="$(jq -r '.path_on_server // empty' <<<"${item}")"
  if [[ -z "${role}" || -z "${remote_path}" || "${role}" == "null" || "${remote_path}" == "null" ]]; then
    echo "[remote-data-ssh-sync][WARN] Skip invalid item: ${item}" >&2
    continue
  fi
  if [[ "${remote_path}" != /* ]]; then
    echo "[remote-data-ssh-sync][WARN] Skip non-absolute path_on_server for role '${role}': ${remote_path}" >&2
    continue
  fi
  # The role becomes a directory under base_dir and is the target of
  # `rsync --delete`; refuse "." / ".." components and absolute roles so a
  # config value can never make rsync delete files outside its own mirror dir.
  case "/${role}/" in
    */../*|*/./*|//*)
      echo "[remote-data-ssh-sync][WARN] Skip unsafe role (would escape mirror dir): ${role}" >&2
      continue
      ;;
  esac

  dest="${base_dir}/${role}"
  echo "[remote-data-ssh-sync] rsync role=${role}"
  echo " from: ${SSH_HOST_ALIAS}:${remote_path}"
  echo " to : ${dest}"

  # Trailing slash to mirror directory contents into dest/
  src="${SSH_HOST_ALIAS}:${remote_path%/}/"
  synced_json="false"
  if [[ "${DRY_RUN}" == "true" ]]; then
    echo " dry-run: rsync -a --delete -e \"ssh -o BatchMode=yes\" \"${src}\" \"${dest}/\""
  else
    mkdir -p "${dest}"
    rsync -a --delete -e "ssh -o BatchMode=yes" "${src}" "${dest}/"
    synced_json="true"
  fi

  jq -c --arg dest "${dest}" --argjson synced "${synced_json}" \
    '{
      role: (.role // ""),
      path_on_server: (.path_on_server // ""),
      dest: $dest,
      synced: $synced
    }' <<<"${item}" >>"${tmp_items}"
  echo
done

manifest="${base_dir}/manifest.json"
# `jq -s` over an empty file yields [], so a run where every item was skipped
# still produces a manifest with "items": [].
jq -s \
  --arg projectId "${PROJECT_ID}" \
  --arg env "${ENV_NAME}" \
  --arg sshHostAlias "${SSH_HOST_ALIAS}" \
  --arg startedAt "${started_at}" \
  --argjson dryRun "${DRY_RUN}" \
  '{
    projectId: $projectId,
    env: $env,
    sshHostAlias: $sshHostAlias,
    startedAt: $startedAt,
    dryRun: $dryRun,
    items: .
  }' \
  "${tmp_items}" >"${manifest}"

echo "[remote-data-ssh-sync] manifest=${manifest}"

if [[ "${INGEST_ANYTHINGLLM}" != "true" ]]; then
  echo "[remote-data-ssh-sync] AnythingLLM ingestion disabled (--no-anythingllm)."
  exit 0
fi

# The slug may be a single string (all envs) or an object keyed by env name.
workspace_slug="$(
  jq -r --arg env "${ENV_NAME}" '
    .smart_ide.anythingllm_workspace_slug as $s
    | if ($s|type) == "string" then $s
      elif ($s|type) == "object" then ($s[$env] // empty)
      else empty
      end
  ' "${CONF_FILE}"
)"
workspace_slug="${workspace_slug//[[:space:]]/}"
if [[ -z "${workspace_slug}" || "${workspace_slug}" == "null" ]]; then
  echo "[remote-data-ssh-sync] AnythingLLM: missing smart_ide.anythingllm_workspace_slug for ${PROJECT_ID}/${ENV_NAME} in ${CONF_FILE} — skip."
  exit 0
fi

# Optional host-level AnythingLLM env file (same convention as the post-merge hook).
# Auto-export everything the env file assigns so the node child process
# started below inherits it.
anythingllm_env_file="${HOME}/.config/4nk/anythingllm-sync.env"
if [[ -f "${anythingllm_env_file}" ]]; then
  set -a
  # shellcheck source=/dev/null
  source "${anythingllm_env_file}"
  set +a
fi

if [[ -z "${ANYTHINGLLM_BASE_URL:-}" || -z "${ANYTHINGLLM_API_KEY:-}" ]]; then
  echo "[remote-data-ssh-sync] AnythingLLM: missing ANYTHINGLLM_BASE_URL or ANYTHINGLLM_API_KEY — skip."
  exit 0
fi

echo
echo "[remote-data-ssh-sync] AnythingLLM ingest workspaceSlug=${workspace_slug}"
echo "[remote-data-ssh-sync] limits: maxFiles=${MAX_FILES} maxBytes=${MAX_BYTES}"

sync_script="${ROOT}/scripts/anythingllm-pull-sync/sync.mjs"
if [[ ! -f "${sync_script}" ]]; then
  echo "[remote-data-ssh-sync][ERROR] Missing ${sync_script}" >&2
  exit 1
fi

# Replace every character outside [A-Za-z0-9._-] with "_".
# Arguments: $1 - raw string
# Outputs:   sanitized string on stdout
sanitize() {
  echo "$1" | sed 's/[^A-Za-z0-9._-]/_/g'
}

ingest_ok=0
ingest_err=0

# Project and env are the same for every role, so sanitize them once.
prefix_base="$(sanitize "${PROJECT_ID}")__$(sanitize "${ENV_NAME}")"

for item in "${ITEMS[@]}"; do
  role="$(jq -r '.role // empty' <<<"${item}")"
  if [[ -z "${role}" || "${role}" == "null" ]]; then
    continue
  fi

  if ! should_ingest_role "${role}"; then
    echo "[remote-data-ssh-sync] AnythingLLM: skip role=${role} (not in --roles)"
    continue
  fi

  dest="${base_dir}/${role}"
  if [[ ! -d "${dest}" ]]; then
    echo "[remote-data-ssh-sync] AnythingLLM: skip role=${role} (missing dest dir: ${dest})" >&2
    continue
  fi

  prefix="${prefix_base}__$(sanitize "${role}")"
  echo "[remote-data-ssh-sync] AnythingLLM: upload-all role=${role} (prefix=${prefix})"

  if [[ "${DRY_RUN}" == "true" ]]; then
    echo " dry-run: ANYTHINGLLM_WORKSPACE_SLUG=... node ${sync_script} --repo-root \"${dest}\" --upload-all --upload-prefix \"${prefix}\""
    continue
  fi

  # A failing upload is counted, not fatal: the remaining roles are still tried
  # and the overall status is decided after the loop.
  if ANYTHINGLLM_WORKSPACE_SLUG="${workspace_slug}" \
    ANYTHINGLLM_SYNC_MAX_FILES="${MAX_FILES}" \
    ANYTHINGLLM_SYNC_MAX_FILE_BYTES="${MAX_BYTES}" \
    node "${sync_script}" --repo-root "${dest}" --upload-all --upload-prefix "${prefix}"; then
    ingest_ok=$((ingest_ok + 1))
  else
    ingest_err=$((ingest_err + 1))
  fi
done

if (( ingest_err > 0 )); then
  echo "[remote-data-ssh-sync][ERROR] AnythingLLM ingestion failed for ${ingest_err} role(s); ok=${ingest_ok}" >&2
  exit 1
fi

echo "[remote-data-ssh-sync] OK (AnythingLLM ok=${ingest_ok})"