# Copy to services/chandra/upstream/local.env (loaded by pydantic-settings via find_dotenv).
# https://github.com/datalab-to/chandra — local Hugging Face inference

# Hugging Face model id (weights downloaded on first run)
MODEL_CHECKPOINT=datalab-to/chandra-ocr-2

# Optional: force device, e.g. cuda:0, cpu
# TORCH_DEVICE=cuda:0

# Optional: flash attention — requires compatible GPU + flash-attn installed
# TORCH_ATTN=flash_attention_2

# Cap on generated tokens per request
MAX_OUTPUT_TOKENS=12384

# If the checkpoint were gated (not the case for chandra-ocr-2 by default), use:
# HF_TOKEN=

# --- vLLM only (not used with --method hf) ---
# VLLM_API_BASE=http://localhost:8000/v1
# VLLM_MODEL_NAME=chandra
# VLLM_GPUS=0