smart_ide/scripts/remote-data-ssh-sync.sh

#!/usr/bin/env bash
# Pull deployed data directories over SSH into a local mirror, then optionally ingest into AnythingLLM.
#
# Source of truth:
#   projects/<id>/conf.json -> smart_ide.remote_data_access.environments.<env>
#
# This script never writes to remote databases. It only performs SSH/rsync reads.
set -euo pipefail

#######################################
# Print the command-line help text.
# Arguments: none
# Outputs:   the help text on stdout (callers redirect it to stderr on errors)
#######################################
usage() {
  # One argument per output line; the '%s\n' format is re-applied to each one,
  # so the text is emitted verbatim without spawning an external process.
  printf '%s\n' \
    'Usage:' \
    '  ./scripts/remote-data-ssh-sync.sh [--project <id>] [--env <test|pprod|prod>]' \
    '                                   [--mirror-root <abs_path>]' \
    '                                   [--roles <comma_separated_roles>]' \
    '                                   [--no-anythingllm]' \
    '                                   [--max-files <n>] [--max-bytes <n>]' \
    '                                   [--dry-run]' \
    '' \
    'Project/env resolution (first match):' \
    '  - --project / --env' \
    '  - SMART_IDE_PROJECT_ID / SMART_IDE_ENV' \
    '  - projects/active-project.json (local, gitignored)' \
    '' \
    'Mirror root:' \
    '  - SMART_IDE_REMOTE_DATA_MIRROR_ROOT, else <smart_ide_root>/.data/remote-data' \
    '' \
    'AnythingLLM ingestion:' \
    '  - enabled by default (skip if AnythingLLM config or workspace slug is missing)' \
    '  - reads workspace slug from projects/<id>/conf.json -> smart_ide.anythingllm_workspace_slug[env]' \
    '  - reads ANYTHINGLLM_BASE_URL / ANYTHINGLLM_API_KEY from ~/.config/4nk/anythingllm-sync.env if present' \
    '  - uses scripts/anythingllm-pull-sync/sync.mjs in --upload-all mode' \
    '' \
    'Notes:' \
    '  - Requires: jq, ssh, rsync, node (>=20).' \
    '  - Output is not filtered; rsync output remains visible.'
}

# Repository root: the parent of the directory that contains this script.
ROOT="$(dirname "${BASH_SOURCE[0]}")"
ROOT="$(cd "${ROOT}/.." && pwd)"

# Behaviour switches; command-line options may override them.
DRY_RUN="false"
INGEST_ANYTHINGLLM="true"
ROLES_CSV=""

# Values seeded from the environment; command-line options may override them.
PROJECT_ID="${SMART_IDE_PROJECT_ID:-}"
ENV_NAME="${SMART_IDE_ENV:-}"
MIRROR_ROOT="${SMART_IDE_REMOTE_DATA_MIRROR_ROOT:-${ROOT}/.data/remote-data}"
MAX_FILES="${ANYTHINGLLM_SYNC_MAX_FILES:-200}"
MAX_BYTES="${ANYTHINGLLM_SYNC_MAX_FILE_BYTES:-5242880}"

# Optional local file consulted when no project id / environment was given.
ACTIVE_PROJECT_FILE="${ROOT}/projects/active-project.json"

#######################################
# Parse command-line options into the configuration globals.
# Globals:   PROJECT_ID, ENV_NAME, MIRROR_ROOT, ROLES_CSV, MAX_FILES,
#            MAX_BYTES, INGEST_ANYTHINGLLM, DRY_RUN (written)
# Arguments: the script's positional parameters
# Outputs:   help text on stdout for -h/--help; an error line plus the help
#            text on stderr for an unknown option or a missing option value
# Returns:   0 once every argument is consumed; exits 0 after printing help
#            and 2 on a usage error
#######################################
parse_args() {
  local opt
  while [[ $# -gt 0 ]]; do
    opt="$1"
    case "${opt}" in
      -h|--help)
        usage
        exit 0
        ;;
      --no-anythingllm)
        INGEST_ANYTHINGLLM="false"
        shift 1
        ;;
      --dry-run)
        DRY_RUN="true"
        shift 1
        ;;
      --project|--env|--mirror-root|--roles|--max-files|--max-bytes)
        # A value-taking option given as the last argument would make
        # `shift 2` fail, and under `set -e` the script would then die
        # silently; report the problem explicitly instead.
        if [[ $# -lt 2 ]]; then
          echo "[remote-data-ssh-sync][ERROR] Missing value for ${opt}" >&2
          usage >&2
          exit 2
        fi
        case "${opt}" in
          --project) PROJECT_ID="$2" ;;
          --env) ENV_NAME="$2" ;;
          --mirror-root) MIRROR_ROOT="$2" ;;
          --roles) ROLES_CSV="$2" ;;
          --max-files) MAX_FILES="$2" ;;
          --max-bytes) MAX_BYTES="$2" ;;
        esac
        shift 2
        ;;
      *)
        echo "[remote-data-ssh-sync][ERROR] Unknown arg: ${opt}" >&2
        usage >&2
        exit 2
        ;;
    esac
  done
}

parse_args "$@"

# Fail fast when any external tool this script relies on is not on PATH.
for required_tool in jq ssh rsync node; do
  if ! command -v "${required_tool}" >/dev/null 2>&1; then
    echo "[remote-data-ssh-sync][ERROR] Missing dependency: ${required_tool}" >&2
    exit 1
  fi
done
unset required_tool

# Anything still unset after the command line and the environment is taken
# from the local active-project file, when that file exists.
if [[ -f "${ACTIVE_PROJECT_FILE}" ]]; then
  if [[ -z "${PROJECT_ID}" ]]; then
    PROJECT_ID="$(jq -r '.id // empty' "${ACTIVE_PROJECT_FILE}")"
  fi
  if [[ -z "${ENV_NAME}" ]]; then
    ENV_NAME="$(jq -r '.default_env // empty' "${ACTIVE_PROJECT_FILE}")"
  fi
fi

# A project id is mandatory; there is no fallback for it.
[[ -n "${PROJECT_ID}" ]] || {
  echo "[remote-data-ssh-sync][ERROR] Missing project id. Provide --project <id> or create projects/active-project.json" >&2
  exit 1
}

# The environment falls back to "test" and must be one of the known names.
ENV_NAME="${ENV_NAME:-test}"
if [[ "${ENV_NAME}" != "test" && "${ENV_NAME}" != "pprod" && "${ENV_NAME}" != "prod" ]]; then
  echo "[remote-data-ssh-sync][ERROR] Invalid --env: ${ENV_NAME} (expected test|pprod|prod)" >&2
  exit 1
fi

# Per-project configuration file; everything below is read from it.
CONF_FILE="${ROOT}/projects/${PROJECT_ID}/conf.json"
[[ -f "${CONF_FILE}" ]] || {
  echo "[remote-data-ssh-sync][ERROR] Missing project conf: ${CONF_FILE}" >&2
  exit 1
}

# jq path of the selected environment's remote-data-access section.
env_conf_path=".smart_ide.remote_data_access.environments.${ENV_NAME}"

# SSH host alias used as the rsync source host; it is required.
SSH_HOST_ALIAS="$(
  jq -r "${env_conf_path}.ssh_host_alias // empty" "${CONF_FILE}"
)"
case "${SSH_HOST_ALIAS}" in
  ""|null)
    echo "[remote-data-ssh-sync][ERROR] Missing ssh_host_alias for ${PROJECT_ID}/${ENV_NAME} in ${CONF_FILE}" >&2
    exit 1
    ;;
esac

# One compact JSON document per configured remote directory.
mapfile -t ITEMS < <(jq -c "${env_conf_path}.remote_data_directories[]? // empty" "${CONF_FILE}")
if (( ${#ITEMS[@]} == 0 )); then
  echo "[remote-data-ssh-sync] No remote_data_directories configured for ${PROJECT_ID}/${ENV_NAME} (nothing to do)."
  exit 0
fi

# Absolute form of the mirror root: an absolute value is used as given, a
# relative one is resolved against ROOT (plain concatenation if realpath fails).
case "${MIRROR_ROOT}" in
  /*)
    MIRROR_ROOT_ABS="${MIRROR_ROOT}"
    ;;
  *)
    MIRROR_ROOT_ABS="$(cd "${ROOT}" && realpath -m "${MIRROR_ROOT}" 2>/dev/null || echo "${ROOT}/${MIRROR_ROOT}")"
    ;;
esac

# Each project/environment pair gets its own directory under the mirror root.
base_dir="${MIRROR_ROOT_ABS}/${PROJECT_ID}/${ENV_NAME}"
mkdir -p "${base_dir}"

# Scratch file that collects one JSON line per processed item; removed on exit.
tmp_items="$(mktemp -t remote-data-items.XXXXXX)"
cleanup() {
  rm -f "${tmp_items}"
}
trap cleanup EXIT

# Banner summarising the resolved settings, followed by a blank line.
printf '[remote-data-ssh-sync] %s\n' \
  "projectId=${PROJECT_ID}" \
  "env=${ENV_NAME}" \
  "sshHostAlias=${SSH_HOST_ALIAS}" \
  "mirrorRoot=${MIRROR_ROOT_ABS}" \
  "items=${#ITEMS[@]}"
printf '\n'

#######################################
# Decide whether a role passes the --roles filter.
# Globals:   ROLES_CSV (read) - comma-separated allow-list; empty allows all
# Arguments: $1 - role name to check
# Returns:   0 when the role is allowed, 1 otherwise
#######################################
should_ingest_role() {
  local role="$1"
  if [[ -z "${ROLES_CSV}" ]]; then
    return 0
  fi
  # Keep the work variables local so nothing leaks into the script's globals.
  local -a allowed=()
  local candidate
  # IFS is overridden for this single `read` only, splitting on commas.
  IFS=, read -r -a allowed <<<"${ROLES_CSV}"
  for candidate in "${allowed[@]}"; do
    # Whitespace inside an entry is ignored ("a, b" matches role "b");
    # parameter expansion avoids one echo|sed fork per entry.
    if [[ "${candidate//[[:space:]]/}" == "${role}" ]]; then
      return 0
    fi
  done
  return 1
}

# Captured before the first transfer so that the manifest's startedAt field
# records when the sync began rather than when it finished.
started_at="$(date -Iseconds)"

# Mirror every configured remote directory into <base_dir>/<role>.
for item in "${ITEMS[@]}"; do
  role="$(jq -r '.role // empty' <<<"${item}")"
  remote_path="$(jq -r '.path_on_server // empty' <<<"${item}")"
  if [[ -z "$role" || -z "$remote_path" || "$role" == "null" || "$remote_path" == "null" ]]; then
    echo "[remote-data-ssh-sync][WARN] Skip invalid item: ${item}" >&2
    continue
  fi
  if [[ "${remote_path}" != /* ]]; then
    echo "[remote-data-ssh-sync][WARN] Skip non-absolute path_on_server for role '${role}': ${remote_path}" >&2
    continue
  fi
  # The role becomes a path component of the rsync --delete destination. A "."
  # or ".." component would aim the deletion at the mirror base itself or at a
  # directory outside it, so such roles are refused.
  if [[ "/${role}/" == */./* || "/${role}/" == */../* ]]; then
    echo "[remote-data-ssh-sync][WARN] Skip unsafe role (contains '.' or '..' path component): ${role}" >&2
    continue
  fi

  dest="${base_dir}/${role}"

  echo "[remote-data-ssh-sync] rsync role=${role}"
  echo "  from: ${SSH_HOST_ALIAS}:${remote_path}"
  echo "  to  : ${dest}"

  # Trailing slash to mirror directory contents into dest/
  src="${SSH_HOST_ALIAS}:${remote_path%/}/"
  synced_json="false"
  if [[ "${DRY_RUN}" == "true" ]]; then
    echo "  dry-run: rsync -a --delete -e \"ssh -o BatchMode=yes\" \"${src}\" \"${dest}/\""
  else
    mkdir -p "${dest}"
    # BatchMode=yes makes ssh fail instead of prompting for credentials.
    rsync -a --delete -e "ssh -o BatchMode=yes" "${src}" "${dest}/"
    synced_json="true"
  fi
  # Record the outcome as one JSON line; the lines are slurped into the manifest below.
  jq -c --arg dest "$dest" --argjson synced "${synced_json}" \
    '{ role: (.role // ""), path_on_server: (.path_on_server // ""), dest: $dest, synced: $synced }' \
    <<<"${item}" >>"${tmp_items}"
  echo
done

# Write the manifest describing this run.
manifest="${base_dir}/manifest.json"
dry_run_json="false"
if [[ "${DRY_RUN}" == "true" ]]; then
  dry_run_json="true"
fi
# `jq -s` turns an empty input file into [], so a single invocation covers both
# the "some items recorded" and the "no item recorded" cases.
jq -s \
  --arg projectId "${PROJECT_ID}" \
  --arg env "${ENV_NAME}" \
  --arg sshHostAlias "${SSH_HOST_ALIAS}" \
  --arg startedAt "${started_at}" \
  --argjson dryRun "${dry_run_json}" \
  '{ projectId: $projectId, env: $env, sshHostAlias: $sshHostAlias, startedAt: $startedAt, dryRun: $dryRun, items: . }' \
  "${tmp_items}" >"${manifest}"
echo "[remote-data-ssh-sync] manifest=${manifest}"

# Ingestion is opt-out: stop successfully here when it was disabled.
[[ "${INGEST_ANYTHINGLLM}" == "true" ]] || {
  echo "[remote-data-ssh-sync] AnythingLLM ingestion disabled (--no-anythingllm)."
  exit 0
}

# The workspace slug is either one string for all environments or an object
# keyed by environment name.
workspace_slug="$(
  jq -r --arg env "${ENV_NAME}" '
    .smart_ide.anythingllm_workspace_slug as $s
    | if ($s|type) == "string" then $s
      elif ($s|type) == "object" then ($s[$env] // empty)
      else empty end
  ' "${CONF_FILE}"
)"
# Drop any whitespace from the slug before using it.
workspace_slug="$(echo "${workspace_slug}" | sed 's/[[:space:]]//g')"
case "${workspace_slug}" in
  ""|null)
    echo "[remote-data-ssh-sync] AnythingLLM: missing smart_ide.anythingllm_workspace_slug for ${PROJECT_ID}/${ENV_NAME} in ${CONF_FILE} — skip."
    exit 0
    ;;
esac

# Optional host-level AnythingLLM env file (same convention as the post-merge hook).
anythingllm_env_file="${HOME}/.config/4nk/anythingllm-sync.env"
if [[ -f "${anythingllm_env_file}" ]]; then
  # Everything the file assigns is exported while allexport is on.
  set -a
  # shellcheck source=/dev/null
  source "${anythingllm_env_file}"
  set +a
fi

# Both the base URL and the API key are needed; otherwise skip without failing.
if ! [[ -n "${ANYTHINGLLM_BASE_URL:-}" && -n "${ANYTHINGLLM_API_KEY:-}" ]]; then
  echo "[remote-data-ssh-sync] AnythingLLM: missing ANYTHINGLLM_BASE_URL or ANYTHINGLLM_API_KEY — skip."
  exit 0
fi

echo
echo "[remote-data-ssh-sync] AnythingLLM ingest workspaceSlug=${workspace_slug}"
echo "[remote-data-ssh-sync] limits: maxFiles=${MAX_FILES} maxBytes=${MAX_BYTES}"

# The uploader lives next to this script; its absence is a hard error.
sync_script="${ROOT}/scripts/anythingllm-pull-sync/sync.mjs"
[[ -f "${sync_script}" ]] || {
  echo "[remote-data-ssh-sync][ERROR] Missing ${sync_script}" >&2
  exit 1
}

#######################################
# Replace every character outside [A-Za-z0-9._-] with an underscore.
# Arguments: $1 - raw text
# Outputs:   the sanitized text on stdout
#######################################
sanitize() {
  # printf rather than echo: echo would interpret values such as "-n" or "-e"
  # as its own options and print nothing useful for them.
  printf '%s\n' "$1" | sed 's/[^A-Za-z0-9._-]/_/g'
}

# Counters for the final summary and the exit status.
ingest_ok=0
ingest_err=0

# Upload each mirrored role directory to AnythingLLM through the sync script.
for item in "${ITEMS[@]}"; do
  role="$(jq -r '.role // empty' <<<"${item}")"
  [[ -n "$role" && "$role" != "null" ]] || continue
  # The role becomes a path component of the directory handed to the uploader;
  # a "." or ".." component would make it read outside the role's own mirror
  # directory, so such roles are refused.
  if [[ "/${role}/" == */./* || "/${role}/" == */../* ]]; then
    echo "[remote-data-ssh-sync][WARN] AnythingLLM: skip unsafe role (contains '.' or '..' path component): ${role}" >&2
    continue
  fi
  if ! should_ingest_role "$role"; then
    echo "[remote-data-ssh-sync] AnythingLLM: skip role=${role} (not in --roles)"
    continue
  fi
  dest="${base_dir}/${role}"
  if [[ ! -d "${dest}" ]]; then
    echo "[remote-data-ssh-sync] AnythingLLM: skip role=${role} (missing dest dir: ${dest})" >&2
    continue
  fi
  # Upload prefix built from project, environment and role.
  prefix="$(sanitize "${PROJECT_ID}")__$(sanitize "${ENV_NAME}")__$(sanitize "${role}")"
  echo "[remote-data-ssh-sync] AnythingLLM: upload-all role=${role} (prefix=${prefix})"

  if [[ "${DRY_RUN}" == "true" ]]; then
    echo "  dry-run: ANYTHINGLLM_WORKSPACE_SLUG=... node ${sync_script} --repo-root \"${dest}\" --upload-all --upload-prefix \"${prefix}\""
    continue
  fi

  # Explicit if/else instead of `cmd && ok || err`: a failed upload is counted
  # and the loop moves on to the next role; the failure is reported at the end.
  if ANYTHINGLLM_WORKSPACE_SLUG="${workspace_slug}" \
    ANYTHINGLLM_SYNC_MAX_FILES="${MAX_FILES}" \
    ANYTHINGLLM_SYNC_MAX_FILE_BYTES="${MAX_BYTES}" \
    node "${sync_script}" --repo-root "${dest}" --upload-all --upload-prefix "${prefix}"; then
    ingest_ok=$((ingest_ok + 1))
  else
    ingest_err=$((ingest_err + 1))
  fi
done

# Any failed role makes the whole run fail.
if [[ "${ingest_err}" -gt 0 ]]; then
  echo "[remote-data-ssh-sync][ERROR] AnythingLLM ingestion failed for ${ingest_err} role(s); ok=${ingest_ok}" >&2
  exit 1
fi
echo "[remote-data-ssh-sync] OK (AnythingLLM ok=${ingest_ok})"