Initial state:
- ia_dev was historically referenced as ./ia_dev in docs and integrations, while the vendored module lives under services/ia_dev.
- AnythingLLM sync and hook installation had error masking / weak exit signaling.
- Proxy layers did not validate proxy path segments, allowing path normalization tricks.

Motivation:
- Make the IDE-oriented workflow usable (sync -> act -> deploy/preview) with explicit errors.
- Reduce security footguns in proxying and script automation.

Resolution:
- Standardize IA_DEV_ROOT usage and documentation to services/ia_dev.
- Add SSH remote data mirroring + optional AnythingLLM ingestion.
- Extend AnythingLLM pull sync to support upload-all/prefix and fail on upload errors.
- Harden smart-ide-sso-gateway and smart-ide-global-api proxying with safe-path checks and non-leaking error responses.
- Improve ia-dev-gateway runner validation and reduce sensitive path leakage.
- Add site scaffold tool (Vite/React) with OIDC + chat via sso-gateway -> orchestrator.

Root cause:
- Historical layout changes (submodule -> vendored tree) and missing central contracts for path resolution.
- Missing validation for proxy path traversal patterns.
- Overuse of silent fallbacks (|| true, exit 0 on partial failures) in automation scripts.

Impacted features:
- Project sync: git pull + AnythingLLM sync + remote data mirror ingestion.
- Site frontends: SSO gateway proxy and orchestrator intents (rag.query, chat.local).
- Agent execution: ia-dev-gateway script runner and SSE output.
Code modified:
- scripts/remote-data-ssh-sync.sh
- scripts/anythingllm-pull-sync/sync.mjs
- scripts/install-anythingllm-post-merge-hook.sh
- cron/git-pull-project-clones.sh
- services/smart-ide-sso-gateway/src/server.ts
- services/smart-ide-global-api/src/server.ts
- services/smart-ide-orchestrator/src/server.ts
- services/ia-dev-gateway/src/server.ts
- services/ia_dev/tools/site-generate.sh

Documentation modified:
- docs/** (architecture, API docs, ia_dev module + integration, scripts)

Configurations modified:
- config/services.local.env.example
- services/*/.env.example

Files in deploy modified:
- services/ia_dev/deploy/*

Files in logs impacted:
- logs/ia_dev.log (runtime only)
- .logs/* (runtime only)

Databases and other sources modified:
- None

Off-project modifications:
- None

Files in .smartIde modified:
- .smartIde/agents/*.md
- services/ia_dev/.smartIde/**

Files in .secrets modified:
- None

New patch version in VERSION:
- 0.0.5

CHANGELOG.md updated:
- yes
319 lines
10 KiB
Bash
Executable File
319 lines
10 KiB
Bash
Executable File
#!/usr/bin/env bash
#
# Pull deployed data directories over SSH into a local mirror, then optionally
# ingest them into AnythingLLM.
#
# Source of truth:
#   projects/<id>/conf.json -> smart_ide.remote_data_access.environments.<env>
#
# This script never writes to remote databases. It only performs SSH/rsync reads.

# Strict mode: abort on unhandled command failures (-e), on unset variables
# (-u), and when any stage of a pipeline fails (pipefail).
set -euo pipefail

#######################################
# Print the command-line help text.
# Arguments: none
# Outputs:   help text on stdout (error paths redirect it to stderr)
#######################################
usage() {
  # Quoted delimiter: the text is emitted literally, without expansion.
  cat <<'EOF'
Usage:
  ./scripts/remote-data-ssh-sync.sh [--project <id>] [--env <test|pprod|prod>]
                                    [--mirror-root <abs_path>]
                                    [--roles <comma_separated_roles>]
                                    [--no-anythingllm]
                                    [--max-files <n>] [--max-bytes <n>]
                                    [--dry-run]
                                    [-h|--help]

Project/env resolution (first match):
  - --project / --env
  - SMART_IDE_PROJECT_ID / SMART_IDE_ENV
  - projects/active-project.json (local, gitignored)

Mirror root:
  - SMART_IDE_REMOTE_DATA_MIRROR_ROOT, else <smart_ide_root>/.data/remote-data

AnythingLLM ingestion:
  - enabled by default (skip if AnythingLLM config or workspace slug is missing)
  - reads workspace slug from projects/<id>/conf.json -> smart_ide.anythingllm_workspace_slug[env]
  - reads ANYTHINGLLM_BASE_URL / ANYTHINGLLM_API_KEY from ~/.config/4nk/anythingllm-sync.env if present
  - uses scripts/anythingllm-pull-sync/sync.mjs in --upload-all mode

Notes:
  - Requires: jq, ssh, rsync, node (>=20).
  - Output is not filtered; rsync output remains visible.
EOF
}

# Repository root: the parent of the directory that contains this script.
# CDPATH is cleared for the lookup so that `cd` cannot resolve a relative
# script directory through CDPATH and echo an extra path into the captured value.
ROOT="$(CDPATH='' cd -- "$(dirname -- "${BASH_SOURCE[0]}")/.." && pwd)"
ACTIVE_PROJECT_FILE="${ROOT}/projects/active-project.json"

# Environment-provided defaults; command-line options override them later.
PROJECT_ID="${SMART_IDE_PROJECT_ID:-}"
ENV_NAME="${SMART_IDE_ENV:-}"
MIRROR_ROOT="${SMART_IDE_REMOTE_DATA_MIRROR_ROOT:-${ROOT}/.data/remote-data}"

# Behaviour toggles and ingestion limits.
INGEST_ANYTHINGLLM="true"
ROLES_CSV=""
DRY_RUN="false"
MAX_FILES="${ANYTHINGLLM_SYNC_MAX_FILES:-200}"
MAX_BYTES="${ANYTHINGLLM_SYNC_MAX_FILE_BYTES:-5242880}"

#######################################
# Abort with a clear message when an option that takes a value is the last
# argument. Without this guard `shift 2` fails under `set -e` and the script
# exits without any diagnostic.
# Arguments: $1 - option name, $2 - number of arguments still to parse
# Outputs:   error message and usage on stderr
# Returns:   0 when a value is present; exits 2 otherwise
#######################################
require_option_value() {
  local opt="$1"
  local remaining="$2"
  if (( remaining < 2 )); then
    echo "[remote-data-ssh-sync][ERROR] Missing value for ${opt}" >&2
    usage >&2
    exit 2
  fi
}

#######################################
# Parse command-line options into the script-level settings.
# Globals:   PROJECT_ID, ENV_NAME, MIRROR_ROOT, ROLES_CSV, INGEST_ANYTHINGLLM,
#            MAX_FILES, MAX_BYTES, DRY_RUN (written)
# Arguments: the script's positional parameters
# Returns:   0 on success; exits 0 on --help, 2 on an unknown or incomplete option
#######################################
parse_cli_args() {
  while (( $# > 0 )); do
    case "$1" in
      -h|--help)
        usage
        exit 0
        ;;
      --project)
        require_option_value "$1" "$#"
        PROJECT_ID="$2"
        shift 2
        ;;
      --env)
        require_option_value "$1" "$#"
        ENV_NAME="$2"
        shift 2
        ;;
      --mirror-root)
        require_option_value "$1" "$#"
        MIRROR_ROOT="$2"
        shift 2
        ;;
      --roles)
        require_option_value "$1" "$#"
        ROLES_CSV="$2"
        shift 2
        ;;
      --no-anythingllm)
        INGEST_ANYTHINGLLM="false"
        shift 1
        ;;
      --max-files)
        require_option_value "$1" "$#"
        MAX_FILES="$2"
        shift 2
        ;;
      --max-bytes)
        require_option_value "$1" "$#"
        MAX_BYTES="$2"
        shift 2
        ;;
      --dry-run)
        DRY_RUN="true"
        shift 1
        ;;
      *)
        echo "[remote-data-ssh-sync][ERROR] Unknown arg: $1" >&2
        usage >&2
        exit 2
        ;;
    esac
  done
}

parse_cli_args "$@"

# Fail fast, naming the first missing external tool.
for required_cmd in jq ssh rsync node; do
  if ! command -v "${required_cmd}" >/dev/null 2>&1; then
    echo "[remote-data-ssh-sync][ERROR] Missing dependency: ${required_cmd}" >&2
    exit 1
  fi
done
unset required_cmd

# Fill in whatever the CLI/environment did not provide from the local
# active-project file, when that file exists.
if [[ -f "${ACTIVE_PROJECT_FILE}" ]]; then
  if [[ -z "${PROJECT_ID}" ]]; then
    PROJECT_ID="$(jq -r '.id // empty' "${ACTIVE_PROJECT_FILE}")"
  fi
  if [[ -z "${ENV_NAME}" ]]; then
    ENV_NAME="$(jq -r '.default_env // empty' "${ACTIVE_PROJECT_FILE}")"
  fi
fi

if [[ -z "${PROJECT_ID}" ]]; then
  echo "[remote-data-ssh-sync][ERROR] Missing project id. Provide --project <id> or create projects/active-project.json" >&2
  exit 1
fi

# The project id becomes a path segment (projects/<id>/conf.json and the
# mirror directory), so it must not be able to climb out of those directories.
case "${PROJECT_ID}" in
  .|..|*/*)
    echo "[remote-data-ssh-sync][ERROR] Invalid project id (must be a single path segment): ${PROJECT_ID}" >&2
    exit 1
    ;;
esac

# Default environment when none was resolved.
ENV_NAME="${ENV_NAME:-test}"

case "${ENV_NAME}" in
  test|pprod|prod) ;;
  *)
    echo "[remote-data-ssh-sync][ERROR] Invalid --env: ${ENV_NAME} (expected test|pprod|prod)" >&2
    exit 1
    ;;
esac

# Per-project configuration file holding the remote data access settings.
CONF_FILE="${ROOT}/projects/${PROJECT_ID}/conf.json"
if [[ ! -f "${CONF_FILE}" ]]; then
  echo "[remote-data-ssh-sync][ERROR] Missing project conf: ${CONF_FILE}" >&2
  exit 1
fi

# The environment name is passed with --arg instead of being spliced into the
# jq program text.
SSH_HOST_ALIAS="$(
  jq -r --arg env "${ENV_NAME}" \
    '.smart_ide.remote_data_access.environments[$env].ssh_host_alias // empty' \
    "${CONF_FILE}"
)"
if [[ -z "${SSH_HOST_ALIAS}" || "${SSH_HOST_ALIAS}" == "null" ]]; then
  echo "[remote-data-ssh-sync][ERROR] Missing ssh_host_alias for ${PROJECT_ID}/${ENV_NAME} in ${CONF_FILE}" >&2
  exit 1
fi
# A host starting with '-' would be read as a command-line option by ssh.
if [[ "${SSH_HOST_ALIAS}" == -* ]]; then
  echo "[remote-data-ssh-sync][ERROR] Invalid ssh_host_alias for ${PROJECT_ID}/${ENV_NAME} in ${CONF_FILE}: ${SSH_HOST_ALIAS}" >&2
  exit 1
fi

# One compact JSON object per configured directory. jq runs in a command
# substitution (not a process substitution) so that a jq failure aborts the
# script instead of looking like an empty list.
items_jsonl="$(
  jq -c --arg env "${ENV_NAME}" \
    '.smart_ide.remote_data_access.environments[$env].remote_data_directories[]? // empty' \
    "${CONF_FILE}"
)"
ITEMS=()
if [[ -n "${items_jsonl}" ]]; then
  mapfile -t ITEMS <<<"${items_jsonl}"
fi
unset items_jsonl

if [[ ${#ITEMS[@]} -eq 0 ]]; then
  echo "[remote-data-ssh-sync] No remote_data_directories configured for ${PROJECT_ID}/${ENV_NAME} (nothing to do)."
  exit 0
fi

# An empty mirror root (e.g. `--mirror-root ""`) would place the mirror
# directly under the repository root; refuse it explicitly.
if [[ -z "${MIRROR_ROOT}" ]]; then
  echo "[remote-data-ssh-sync][ERROR] Mirror root must not be empty" >&2
  exit 1
fi

# Relative mirror roots are interpreted relative to the repository root.
MIRROR_ROOT_ABS="${MIRROR_ROOT}"
if [[ "${MIRROR_ROOT_ABS}" != /* ]]; then
  if resolved_root="$(cd "${ROOT}" && realpath -m -- "${MIRROR_ROOT_ABS}" 2>/dev/null)"; then
    MIRROR_ROOT_ABS="${resolved_root}"
  else
    # realpath unavailable or lacking -m: fall back to a plain join.
    MIRROR_ROOT_ABS="${ROOT}/${MIRROR_ROOT_ABS}"
  fi
  unset resolved_root
fi

# Per-project, per-environment mirror directory.
base_dir="${MIRROR_ROOT_ABS}/${PROJECT_ID}/${ENV_NAME}"
mkdir -p -- "${base_dir}"

# Scratch file collecting one JSON line per processed item; it is removed on
# every exit path by the EXIT trap below.
tmp_items="$(mktemp -t remote-data-items.XXXXXX)"

#######################################
# Remove the scratch file.
# Globals: tmp_items (read)
#######################################
cleanup() {
  rm -f -- "${tmp_items}"
}
trap cleanup EXIT

# Summary of the resolved run parameters.
printf '%s\n' \
  "[remote-data-ssh-sync] projectId=${PROJECT_ID}" \
  "[remote-data-ssh-sync] env=${ENV_NAME}" \
  "[remote-data-ssh-sync] sshHostAlias=${SSH_HOST_ALIAS}" \
  "[remote-data-ssh-sync] mirrorRoot=${MIRROR_ROOT_ABS}" \
  "[remote-data-ssh-sync] items=${#ITEMS[@]}" \
  ""

#######################################
# Decide whether a role is selected by the --roles filter.
# Globals:   ROLES_CSV (read) - comma-separated allow-list; empty means "all"
# Arguments: $1 - role name
# Returns:   0 when the role is allowed, 1 otherwise
#######################################
should_ingest_role() {
  local role="$1"
  local -a allowed=()
  local candidate

  # No filter configured: every role is allowed.
  if [[ -z "${ROLES_CSV:-}" ]]; then
    return 0
  fi

  # IFS is changed for this single `read` only.
  IFS=, read -r -a allowed <<<"${ROLES_CSV}"
  for candidate in "${allowed[@]}"; do
    # Whitespace inside an entry is ignored ("a, b" matches role "b").
    if [[ "${candidate//[[:space:]]/}" == "${role}" ]]; then
      return 0
    fi
  done
  return 1
}

# Mirror each configured remote directory into <base_dir>/<role> and append
# one JSON line per processed item to the scratch file.
for item in "${ITEMS[@]}"; do
  role="$(jq -r '.role // empty' <<<"${item}")"
  remote_path="$(jq -r '.path_on_server // empty' <<<"${item}")"
  if [[ -z "${role}" || -z "${remote_path}" || "${role}" == "null" || "${remote_path}" == "null" ]]; then
    echo "[remote-data-ssh-sync][WARN] Skip invalid item: ${item}" >&2
    continue
  fi
  # The role is used as a directory name under base_dir and rsync runs with
  # --delete, so a role such as "." or ".." would delete files outside its
  # own mirror directory.
  case "${role}" in
    .|..|*/*)
      echo "[remote-data-ssh-sync][WARN] Skip unsafe role name (must be a single path segment): ${role}" >&2
      continue
      ;;
  esac
  if [[ "${remote_path}" != /* ]]; then
    echo "[remote-data-ssh-sync][WARN] Skip non-absolute path_on_server for role '${role}': ${remote_path}" >&2
    continue
  fi

  dest="${base_dir}/${role}"

  echo "[remote-data-ssh-sync] rsync role=${role}"
  echo "  from: ${SSH_HOST_ALIAS}:${remote_path}"
  echo "  to  : ${dest}"

  # Trailing slash to mirror directory contents into dest/
  src="${SSH_HOST_ALIAS}:${remote_path%/}/"
  synced_json="false"
  if [[ "${DRY_RUN}" == "true" ]]; then
    echo "  dry-run: rsync -a --delete -e \"ssh -o BatchMode=yes\" \"${src}\" \"${dest}/\""
  else
    mkdir -p -- "${dest}"
    # BatchMode=yes makes ssh fail instead of prompting for credentials.
    rsync -a --delete -e "ssh -o BatchMode=yes" -- "${src}" "${dest}/"
    synced_json="true"
  fi

  jq -c --arg dest "${dest}" --argjson synced "${synced_json}" \
    '{ role: (.role // ""), path_on_server: (.path_on_server // ""), dest: $dest, synced: $synced }' \
    <<<"${item}" >>"${tmp_items}"
  echo
done

# Write <base_dir>/manifest.json describing this run.
manifest="${base_dir}/manifest.json"
# NOTE(review): this timestamp is captured here, i.e. after the transfers
# above have run, although the manifest key is named startedAt - confirm intent.
started_at="$(date -Iseconds)"

dry_run_json="false"
if [[ "${DRY_RUN}" == "true" ]]; then
  dry_run_json="true"
fi

# `jq -s` slurps the JSON lines into one array; an empty scratch file yields
# [], so no separate "no items" branch is required. The manifest is written to
# a temporary sibling and renamed so that a jq failure cannot leave a
# truncated manifest.json behind.
manifest_tmp="${manifest}.tmp.$$"
if ! jq -s \
  --arg projectId "${PROJECT_ID}" \
  --arg env "${ENV_NAME}" \
  --arg sshHostAlias "${SSH_HOST_ALIAS}" \
  --arg startedAt "${started_at}" \
  --argjson dryRun "${dry_run_json}" \
  '{ projectId: $projectId, env: $env, sshHostAlias: $sshHostAlias, startedAt: $startedAt, dryRun: $dryRun, items: . }' \
  "${tmp_items}" >"${manifest_tmp}"; then
  rm -f -- "${manifest_tmp}"
  echo "[remote-data-ssh-sync][ERROR] Failed to write manifest: ${manifest}" >&2
  exit 1
fi
mv -f -- "${manifest_tmp}" "${manifest}"
echo "[remote-data-ssh-sync] manifest=${manifest}"

# Stop after mirroring when ingestion was turned off on the command line.
if [[ "${INGEST_ANYTHINGLLM}" != "true" ]]; then
  echo "[remote-data-ssh-sync] AnythingLLM ingestion disabled (--no-anythingllm)."
  exit 0
fi

# The workspace slug may be configured either as one string (used for every
# environment) or as an object keyed by environment name.
workspace_slug="$(
  jq -r --arg env "${ENV_NAME}" '
    .smart_ide.anythingllm_workspace_slug as $s
    | if ($s|type) == "string" then $s
      elif ($s|type) == "object" then ($s[$env] // empty)
      else empty end
  ' "${CONF_FILE}"
)"
# Drop all whitespace from the configured value.
workspace_slug="${workspace_slug//[[:space:]]/}"
if [[ -z "${workspace_slug}" || "${workspace_slug}" == "null" ]]; then
  echo "[remote-data-ssh-sync] AnythingLLM: missing smart_ide.anythingllm_workspace_slug for ${PROJECT_ID}/${ENV_NAME} in ${CONF_FILE} — skip."
  exit 0
fi

# Optional host-level AnythingLLM env file (same convention as the post-merge hook).
anythingllm_env_file="${HOME}/.config/4nk/anythingllm-sync.env"
if [[ -f "${anythingllm_env_file}" ]]; then
  # `set -a` exports every variable the file assigns, so the node process
  # started later inherits them.
  set -a
  # shellcheck source=/dev/null
  source "${anythingllm_env_file}"
  set +a
fi

# Ingestion is skipped (exit 0) when the connection settings are absent.
if [[ -z "${ANYTHINGLLM_BASE_URL:-}" || -z "${ANYTHINGLLM_API_KEY:-}" ]]; then
  echo "[remote-data-ssh-sync] AnythingLLM: missing ANYTHINGLLM_BASE_URL or ANYTHINGLLM_API_KEY — skip."
  exit 0
fi

# Announce the ingestion parameters.
echo
echo "[remote-data-ssh-sync] AnythingLLM ingest workspaceSlug=${workspace_slug}"
echo "[remote-data-ssh-sync] limits: maxFiles=${MAX_FILES} maxBytes=${MAX_BYTES}"

# The uploader script must exist before any role is processed.
sync_script="${ROOT}/scripts/anythingllm-pull-sync/sync.mjs"
if [[ ! -f "${sync_script}" ]]; then
  echo "[remote-data-ssh-sync][ERROR] Missing ${sync_script}" >&2
  exit 1
fi

#######################################
# Replace every character outside [A-Za-z0-9._-] with an underscore.
# Arguments: $1 - raw string
# Outputs:   sanitized string on stdout
#######################################
sanitize() {
  # printf, not echo: echo would treat inputs such as "-n" or "-e" as options
  # and print nothing useful.
  printf '%s\n' "$1" | sed 's/[^A-Za-z0-9._-]/_/g'
}

# Upload each mirrored role directory to AnythingLLM, counting successes and
# failures so that one failing role does not hide the others.
ingest_ok=0
ingest_err=0

for item in "${ITEMS[@]}"; do
  role="$(jq -r '.role // empty' <<<"${item}")"
  [[ -n "${role}" && "${role}" != "null" ]] || continue
  # The role becomes a directory name under base_dir; never follow a role
  # that would resolve to base_dir itself or to a location outside it.
  case "${role}" in
    .|..|*/*)
      echo "[remote-data-ssh-sync][WARN] Skip unsafe role name (must be a single path segment): ${role}" >&2
      continue
      ;;
  esac
  if ! should_ingest_role "${role}"; then
    echo "[remote-data-ssh-sync] AnythingLLM: skip role=${role} (not in --roles)"
    continue
  fi
  dest="${base_dir}/${role}"
  if [[ ! -d "${dest}" ]]; then
    echo "[remote-data-ssh-sync] AnythingLLM: skip role=${role} (missing dest dir: ${dest})" >&2
    continue
  fi
  prefix="$(sanitize "${PROJECT_ID}")__$(sanitize "${ENV_NAME}")__$(sanitize "${role}")"
  echo "[remote-data-ssh-sync] AnythingLLM: upload-all role=${role} (prefix=${prefix})"

  if [[ "${DRY_RUN}" == "true" ]]; then
    echo "  dry-run: ANYTHINGLLM_WORKSPACE_SLUG=... node ${sync_script} --repo-root \"${dest}\" --upload-all --upload-prefix \"${prefix}\""
    continue
  fi

  # The settings are passed through the environment of this one invocation.
  # An explicit if/else keeps the uploader's exit status as the only thing
  # that selects which counter is incremented.
  if ANYTHINGLLM_WORKSPACE_SLUG="${workspace_slug}" \
    ANYTHINGLLM_SYNC_MAX_FILES="${MAX_FILES}" \
    ANYTHINGLLM_SYNC_MAX_FILE_BYTES="${MAX_BYTES}" \
    node "${sync_script}" --repo-root "${dest}" --upload-all --upload-prefix "${prefix}"; then
    ingest_ok=$((ingest_ok + 1))
  else
    ingest_err=$((ingest_err + 1))
  fi
done

# Any failed role makes the whole run fail.
if [[ "${ingest_err}" -gt 0 ]]; then
  echo "[remote-data-ssh-sync][ERROR] AnythingLLM ingestion failed for ${ingest_err} role(s); ok=${ingest_ok}" >&2
  exit 1
fi
echo "[remote-data-ssh-sync] OK (AnythingLLM ok=${ingest_ok})"