#!/usr/bin/env python3
|
|
"""
|
|
Fetch inbox emails filtered by authorized senders (conf.json tickets.authorized_emails).
|
|
Only messages on or after MAIL_SINCE_DATE (default 10-Mar-2026) are considered. Does not use UNSEEN; does not mark as read
|
|
(uses BODY.PEEK[] so the server does not set \\Seen). Writes each matching message to projects/<id>/data/issues/
|
|
as JSON with filename <date>.<id>.<from>.<status> (status=pending). One file per message.
|
|
Id is deterministic from message (message_id or uid+date+from) so the same mail always gets the same base.
|
|
|
|
State: the spool directory (data/issues/) is the only record of what was already treated. We skip creating .pending
|
|
if .pending already exists or if .response exists for that base. If data/issues/ is empty, the script cannot know
|
|
what was treated before; the next run will create .pending for every matching message in the mailbox.
|
|
Usage: run from project root with GITEA_ISSUES_DIR and PROJECT_ROOT set (e.g. via tickets-fetch-inbox.sh).
|
|
Override date: MAIL_SINCE_DATE (IMAP format DD-Mon-YYYY, e.g. 10-Mar-2026).
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import email
|
|
import hashlib
|
|
import imaplib
|
|
import json
|
|
import os
|
|
import re
|
|
import sys
|
|
from datetime import datetime, timezone
|
|
from email.header import decode_header
|
|
from email.utils import parsedate_to_datetime
|
|
from pathlib import Path
|
|
|
|
sys.path.insert(0, str(Path(__file__).resolve().parent))
|
|
from mail_common import imap_search_criterion_all, load_imap_config, imap_ssl_context
|
|
from project_config import (
|
|
authorized_emails,
|
|
data_issues_dir,
|
|
ia_dev_root,
|
|
load_project_config,
|
|
project_dir,
|
|
project_root,
|
|
)
|
|
|
|
|
|
def decode_header_value(header: str | None) -> str:
|
|
if not header:
|
|
return ""
|
|
parts = decode_header(header)
|
|
result = []
|
|
for part, charset in parts:
|
|
if isinstance(part, bytes):
|
|
result.append(part.decode(charset or "utf-8", errors="replace"))
|
|
else:
|
|
result.append(part)
|
|
return "".join(result)
|
|
|
|
|
|
def parse_from_address(from_header: str) -> str:
    """Extract the bare address from a From header.

    'Name <user@host>' -> 'user@host'; a header without angle brackets is
    returned as-is. Result is stripped and lowercased; "" for empty input.
    """
    if not from_header:
        return ""
    bracketed = re.search(r"<([^>]+)>", from_header)
    addr = bracketed.group(1) if bracketed else from_header
    return addr.strip().lower()
|
|
|
|
|
|
def get_text_body(msg: email.message.Message) -> str:
    """Return the first text/plain body of *msg*, decoded to str.

    For multipart messages, walks the parts and returns the first
    non-empty text/plain payload; "" when none is found or decodable.
    """
    if not msg.is_multipart():
        data = msg.get_payload(decode=True)
        if not data:
            return ""
        return data.decode(msg.get_content_charset() or "utf-8", errors="replace")
    for part in msg.walk():
        if part.get_content_type() != "text/plain":
            continue
        data = part.get_payload(decode=True)
        if data:
            return data.decode(
                part.get_content_charset() or "utf-8", errors="replace"
            )
    return ""
|
|
|
|
|
|
def _extract_addresses(header_value: str) -> set[str]:
    """Collect lowercased addresses from a header value.

    Handles both 'Name <user@host>' and bare 'user@host' forms, possibly
    mixed in one header. Empty/blank input yields an empty set.
    """
    if not header_value or not header_value.strip():
        return set()
    text = decode_header_value(header_value).strip()
    # Addresses inside angle brackets, then standalone addr-specs
    # (simplified local@domain pattern).
    candidates = re.findall(r"<([^>]+)>", text)
    candidates += re.findall(
        r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9][a-zA-Z0-9.-]*[a-zA-Z0-9]", text
    )
    return {addr.strip().lower() for addr in candidates}
|
|
|
|
|
|
def is_sent_to_alias(msg: email.message.Message, filter_to: str) -> bool:
    """True if the message was addressed or delivered to *filter_to*.

    Inspects To/Cc plus the common delivery headers set by MTAs. An empty
    filter accepts every message.
    """
    if not filter_to:
        return True
    wanted = filter_to.strip().lower()
    candidate_headers = (
        "To",
        "Cc",
        "Delivered-To",
        "X-Original-To",
        "X-Delivered-To",
        "X-Envelope-To",
        "Envelope-To",
    )
    for header_name in candidate_headers:
        raw = msg.get(header_name)
        if not raw:
            continue
        # Loose substring match first; NOTE(review): this can match a
        # partial address (e.g. "to@x" inside "auto@x") — kept as-is.
        if wanted in decode_header_value(raw).lower():
            return True
        # Exact match against the parsed address set.
        if wanted in _extract_addresses(raw):
            return True
    return False
|
|
|
|
|
|
def sanitize_from_for_filename(email_addr: str) -> str:
    """Filesystem-safe string from an email address.

    e.g. user@example.com -> user_example.com. Every character outside
    [a-zA-Z0-9._-] becomes "_" ("@" included, so no separate replace needed).
    """
    return re.sub(r"[^a-zA-Z0-9._-]", "_", email_addr)
|
|
|
|
|
|
def generate_message_id(mid: str | None, uid_s: str, date_str: str, from_addr: str) -> str:
|
|
"""Deterministic 8-char id so the same message always gets the same base filename."""
|
|
raw = mid or f"{uid_s}_{date_str}_{from_addr}"
|
|
return hashlib.sha256(raw.encode("utf-8")).hexdigest()[:8]
|
|
|
|
|
|
def sanitize_attachment_filename(name: str) -> str:
    """Safe filename for an attachment: no path component, no odd chars.

    Blank input and names that sanitize to nothing become "attachment";
    result is capped at 200 characters.
    """
    if not name or not name.strip():
        return "attachment"
    # Keep only the leaf so "../../x" cannot escape the spool directory.
    leaf = Path(name).name
    cleaned = re.sub(r"[^a-zA-Z0-9._-]", "_", leaf)[:200]
    return cleaned or "attachment"
|
|
|
|
|
|
def get_attachments(msg: email.message.Message) -> list[tuple[str, bytes, str]]:
    """Return (filename, payload_bytes, content_type) for each attachment.

    Parts without a filename are kept only when their Content-Disposition
    says "attachment"; they get a synthetic name. Multipart containers and
    parts with no decodable payload are skipped.
    """
    collected: list[tuple[str, bytes, str]] = []
    for part in msg.walk():
        ctype = (part.get_content_type() or "").lower()
        if ctype.startswith("multipart/"):
            # Containers carry no payload of their own.
            continue
        name = part.get_filename()
        if not name:
            disposition = (part.get("Content-Disposition") or "").lower()
            if "attachment" not in disposition:
                continue
            suffix = ""
            if "image" in ctype:
                # NOTE(review): "octet-stream" cannot occur inside an
                # image/* type, so this effectively always picks ".img";
                # kept as-is to preserve behavior.
                suffix = ".bin" if "octet-stream" in ctype else ".img"
            name = f"attachment{suffix}"
        name = decode_header_value(name).strip()
        if not name:
            continue
        blob = part.get_payload(decode=True)
        if blob is None:
            continue
        collected.append((name, blob, ctype))
    return collected
|
|
|
|
|
|
def parse_references(refs: str | None) -> list[str]:
|
|
if not refs:
|
|
return []
|
|
return [x.strip() for x in re.split(r"\s+", refs) if x.strip()]
|
|
|
|
|
|
def main() -> int:
    """Fetch authorized inbox mail and spool each message as a .pending JSON file.

    Returns a process exit code: 0 on success, 1 on configuration errors.
    """
    conf = load_project_config()
    if not conf:
        # No config: reconstruct where we *would* have looked, purely for
        # the diagnostic message below.
        root = project_root()
        ia_dev = ia_dev_root()
        slug_path = root / "ai_project_id"
        if not slug_path.is_file():
            slug_path = root / ".ia_project"  # legacy location fallback
        slug = (slug_path.read_text(encoding="utf-8").strip() if slug_path.is_file() else "") or os.environ.get("IA_PROJECT", "").strip()
        conf_path = ia_dev / "projects" / (slug or "<slug>") / "conf.json"
        print("[tickets-fetch-inbox] No project config.", file=sys.stderr)
        print(f"[tickets-fetch-inbox] PROJECT_ROOT={project_root()!s}, slug={slug!r}, looked at {conf_path}", file=sys.stderr)
        return 1
    auth = authorized_emails()
    filter_to = (auth.get("to") or "").strip().lower()
    from_list = auth.get("from")
    # from must be a list of addresses, one per entry (not a single string
    # with several addresses); a comma/semicolon-separated string is
    # tolerated and split here.
    if isinstance(from_list, list):
        allowed_from = {str(a).strip().lower() for a in from_list if a}
    elif isinstance(from_list, str):
        allowed_from = {a.strip().lower() for a in re.split(r"[,;]", from_list) if a.strip()}
    else:
        allowed_from = set()
    if not filter_to or not allowed_from:
        print(
            "[tickets-fetch-inbox] tickets.authorized_emails.to and .from required in conf.json.",
            file=sys.stderr,
        )
        return 1

    cfg = load_imap_config()
    if not cfg["user"] or not cfg["password"]:
        print("[tickets-fetch-inbox] IMAP_USER and IMAP_PASSWORD required.", file=sys.stderr)
        return 1

    # The spool directory is the only dedup state (see module docstring).
    spool = data_issues_dir()
    pd = project_dir()
    if pd is None:
        print(f"[tickets-fetch-inbox] WARNING: project_dir is None, using fallback spool {spool}", file=sys.stderr)
    else:
        print(f"[tickets-fetch-inbox] Spool directory: {spool}")
    spool.mkdir(parents=True, exist_ok=True)

    # Plain IMAP connection; STARTTLS is applied below when configured.
    mail = imaplib.IMAP4(cfg["host"], int(cfg["port"]))
    if cfg["use_starttls"]:
        mail.starttls(imap_ssl_context(cfg.get("ssl_verify", True)))
    mail.login(cfg["user"], cfg["password"])
    mail.select("INBOX")
    # Do not use UNSEEN; fetch messages on or after MAIL_SINCE_DATE (default 10-Mar-2026). Filter by authorized senders only.
    # Use BODY.PEEK[] instead of RFC822 so the server does not set \Seen (emails stay "unread").
    since_criterion = imap_search_criterion_all()
    _, nums = mail.search(None, since_criterion)
    ids = nums[0].split()
    # Per-reason skip counters, reported in the summary at the end.
    written = 0
    skipped_fetch = 0
    skipped_filter_to = 0
    skipped_from = 0
    skipped_pending = 0
    skipped_response = 0
    debug_to = os.environ.get("TICKETS_FETCH_DEBUG", "").strip() == "1"
    debug_to_count = 0
    debug_to_max = 3  # cap debug samples so logs stay readable
    for uid in ids:
        uid_s = uid.decode("ascii")
        _, data = mail.fetch(uid, "(BODY.PEEK[])")
        if not data or not data[0]:
            skipped_fetch += 1
            continue
        # imaplib may return the literal as (envelope, bytes) tuples or raw
        # bytes depending on the server response; normalize to raw_bytes.
        raw = data[0]
        raw_bytes = None
        if isinstance(raw, tuple):
            if len(raw) >= 2 and isinstance(raw[1], bytes):
                raw_bytes = raw[1]
            elif len(raw) >= 2 and isinstance(raw[1], str):
                raw_bytes = raw[1].encode("utf-8", errors="replace")
        elif isinstance(raw, bytes):
            raw_bytes = raw
        if not raw_bytes:
            skipped_fetch += 1
            continue
        try:
            msg = email.message_from_bytes(raw_bytes)
        except Exception:
            # Unparseable message: count it and move on rather than abort.
            skipped_fetch += 1
            continue
        if not is_sent_to_alias(msg, filter_to):
            skipped_filter_to += 1
            if debug_to and debug_to_count < debug_to_max:
                # Dump the routing headers of a few rejected messages to
                # help diagnose alias/mailbox mismatches.
                debug_to_count += 1
                h_to = decode_header_value(msg.get("To"))
                h_dt = decode_header_value(msg.get("Delivered-To"))
                h_xo = decode_header_value(msg.get("X-Original-To"))
                h_from = decode_header_value(msg.get("From"))
                received = msg.get_all("Received") or []
                received_sample = " | ".join(decode_header_value(r)[:80] for r in received[:2])
                print(
                    f"[tickets-fetch-inbox] DEBUG not_to_alias (sample {debug_to_count}/{debug_to_max}): "
                    f"filter_to={filter_to!r} | To={h_to!r} | Delivered-To={h_dt!r} | X-Original-To={h_xo!r} | From={h_from!r}",
                    file=sys.stderr,
                )
                print(f"[tickets-fetch-inbox] DEBUG Received (first 2): {received_sample!r}", file=sys.stderr)
            continue
        from_raw = decode_header_value(msg.get("From"))
        from_addr = parse_from_address(from_raw)
        if from_addr not in allowed_from:
            skipped_from += 1
            continue
        # Collect the header fields stored in the spool JSON.
        mid = (msg.get("Message-ID") or "").strip()
        to_raw = decode_header_value(msg.get("To"))
        to_addrs = [a.strip() for a in re.split(r"[,;]", to_raw) if a.strip()]
        subj = decode_header_value(msg.get("Subject"))
        date_h = decode_header_value(msg.get("Date"))
        refs = parse_references(msg.get("References"))
        in_reply_to = (msg.get("In-Reply-To") or "").strip() or None
        body = get_text_body(msg)
        # Date for the filename; fall back to "now" when the header is
        # missing or unparseable.
        try:
            if date_h:
                dt = parsedate_to_datetime(date_h)
                date_str = dt.strftime("%Y-%m-%dT%H%M%S")
            else:
                date_str = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H%M%S")
        except Exception:
            date_str = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H%M%S")
        from_safe = sanitize_from_for_filename(from_addr)
        msg_id_short = generate_message_id(mid, uid_s, date_str, from_addr)
        # Deterministic base name: same mail always produces the same base.
        base = f"{date_str}.{msg_id_short}.{from_safe}"
        path = spool / f"{base}.pending"
        if path.exists():
            skipped_pending += 1
            continue
        # Already treated: .response exists (we don't keep .pending after replying).
        if (spool / f"{base}.response").exists():
            skipped_response += 1
            continue
        created_at = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
        attachments_meta: list[dict[str, str | int]] = []
        attachment_parts = get_attachments(msg)
        if attachment_parts:
            # Attachments live next to the JSON in a "<base>.d/" directory;
            # the JSON only stores relative paths and metadata.
            att_dir = spool / f"{base}.d"
            att_dir.mkdir(parents=True, exist_ok=True)
            for idx, (orig_name, payload_bytes, content_type) in enumerate(attachment_parts):
                safe_name = sanitize_attachment_filename(orig_name)
                stored_name = f"{idx}_{safe_name}"  # idx prefix avoids collisions
                stored_path = att_dir / stored_name
                stored_path.write_bytes(payload_bytes)
                rel_path = f"{base}.d/{stored_name}"
                attachments_meta.append({
                    "filename": orig_name,
                    "path": rel_path,
                    "content_type": content_type,
                    "size": len(payload_bytes),
                })
        payload = {
            "version": 1,
            "type": "incoming",
            "id": msg_id_short,
            "message_id": mid or "",
            "from": from_addr,
            "to": to_addrs,
            "subject": subj,
            "date": date_h or "",
            "body": body or "",
            "references": refs,
            "in_reply_to": in_reply_to,
            "uid": uid_s,
            "created_at": created_at,
            "issue_number": None,
            "status": "pending",
            "attachments": attachments_meta,
        }
        path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")
        written += 1
        print(f"[tickets-fetch-inbox] Wrote {path.name}")

    # NOTE(review): logout is not in a try/finally, so an exception above
    # leaves the IMAP session to be closed by the server — confirm intended.
    mail.logout()
    print(f"[tickets-fetch-inbox] Done. Wrote {written} new message(s) to {spool}.")
    if skipped_fetch or skipped_filter_to or skipped_from or skipped_pending or skipped_response:
        print(
            f"[tickets-fetch-inbox] Skipped: fetch/parse={skipped_fetch}, not_to_alias={skipped_filter_to}, "
            f"from_not_allowed={skipped_from}, pending_exists={skipped_pending}, response_exists={skipped_response}."
        )
    # Everything rejected on the alias check and nothing on the sender check
    # usually means the IMAP account serves a different address: explain.
    if skipped_filter_to > 0 and written == 0 and skipped_from == 0:
        print(
            "[tickets-fetch-inbox] All messages were excluded by not_to_alias: none have To/Delivered-To/X-Original-To matching the configured alias.",
            file=sys.stderr,
        )
        print(
            "[tickets-fetch-inbox] The IMAP account you are using receives mail for another address (e.g. Delivered-To shows that address). "
            "To fetch messages sent to authorized_emails.to, connect IMAP to the mailbox that receives mail for that alias (same address as authorized_emails.to).",
            file=sys.stderr,
        )
    return 0
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: propagate main()'s exit code to the shell.
    raise SystemExit(main())
|