# Copy to services/chandra/upstream/local.env (loaded by pydantic-settings via find_dotenv).
# https://github.com/datalab-to/chandra — local Hugging Face inference

# Hugging Face model id (weights downloaded on first run)
MODEL_CHECKPOINT=datalab-to/chandra-ocr-2

# Optional: force device, e.g. cuda:0, cpu
# TORCH_DEVICE=cuda:0

# Optional: flash attention — requires compatible GPU + flash-attn installed
# TORCH_ATTN=flash_attention_2

# Cap on generated tokens per request
MAX_OUTPUT_TOKENS=12384

# If the checkpoint were gated (not the case for chandra-ocr-2 by default), use:
# HF_TOKEN=

# --- vLLM only (not used with --method hf) ---
# VLLM_API_BASE=http://localhost:8000/v1
# VLLM_MODEL_NAME=chandra
# VLLM_GPUS=0