"""Import parsed ChatGPT chat logs into Lyra's memory. Consumes the parser's `{"title": ..., "messages": [{"role", "content"}]}` format (one JSON file per conversation). Each conversation becomes a Lyra session; each text message becomes an exchange. Embeddings are batched. Import is idempotent — a conversation already present (by session id) is skipped. Timestamps: this format carries no dates, so imported exchanges are stamped with `created_at` (default: now). A future timestamped export will let era memory group by real calendar time; pass real per-message dates then. """ from __future__ import annotations import json import sys from datetime import datetime, timezone from pathlib import Path from lyra import llm, logbus, memory EMBED_BATCH = 64 EMBED_CHAR_CAP = 6000 # cap embed input size; full content is still stored # Message content types worth keeping from a raw ChatGPT export. We drop # 'thoughts' (internal chain-of-thought) and 'reasoning_recap' (meta). KEEP_CONTENT_TYPES = {"text", "multimodal_text"} def _session_id(path: Path) -> str: """Stable id derived from the filename, so re-imports don't duplicate.""" return "import-" + path.stem def _clean_messages(messages: list[dict]) -> list[tuple[str, str]]: out: list[tuple[str, str]] = [] for m in messages: role = m.get("role") if role not in ("user", "assistant"): continue content = (m.get("content") or "").strip() if not content or content.startswith('{"content_type"'): # skip empty / image assets continue out.append((role, content)) return out def import_file(path: Path, created_at: str) -> int: """Import one conversation file. Returns exchanges added (0 if skipped/empty).""" data = json.loads(path.read_text(encoding="utf-8")) session_id = _session_id(path) if memory.history(session_id): # already imported return 0 msgs = _clean_messages(data.get("messages", [])) if not msgs: return 0 memory.ensure_session(session_id, name=data.get("title") or path.stem) rows: list[tuple[str, str, list[float], str]] = [] for i in range(0, len(msgs), EMBED_BATCH): batch = msgs[i : i + EMBED_BATCH] embeddings = llm.embed([content[:EMBED_CHAR_CAP] for _, content in batch]) for (role, content), emb in zip(batch, embeddings): rows.append((role, content, emb, created_at)) return memory.add_exchanges_bulk(session_id, rows) def import_dir(dirpath: str | Path, created_at: str | None = None) -> dict: """Import every *.json under dirpath (recursively). Returns a small report.""" created_at = created_at or datetime.now(timezone.utc).isoformat() files = sorted(Path(dirpath).rglob("*.json")) sessions, exchanges = 0, 0 for path in files: added = import_file(path, created_at) if added: sessions += 1 exchanges += added logbus.log( "info", "import complete", dir=str(dirpath), files=len(files), sessions=sessions, exchanges=exchanges, ) return {"files": len(files), "sessions_imported": sessions, "exchanges": exchanges} # --- Raw ChatGPT export (sharded conversations-*.json with timestamps) --- def _ts_to_iso(ts: float | None, fallback: str) -> str: if not ts: return fallback return datetime.fromtimestamp(ts, tz=timezone.utc).isoformat() def _message_text(msg: dict) -> str | None: """Extract plain text from a ChatGPT message node, or None to skip it.""" content = msg.get("content") or {} if content.get("content_type") not in KEEP_CONTENT_TYPES: return None parts = [p for p in (content.get("parts") or []) if isinstance(p, str) and p.strip()] text = "\n".join(parts).strip() return text or None def _convo_rows(convo: dict) -> list[tuple[float, str, str]]: """(create_time, role, text) for each keepable message, chronologically.""" rows: list[tuple[float, str, str]] = [] conv_ct = convo.get("create_time") or 0 for node in convo.get("mapping", {}).values(): msg = node.get("message") if not msg: continue role = (msg.get("author") or {}).get("role") if role not in ("user", "assistant"): continue text = _message_text(msg) if text is None: continue rows.append((msg.get("create_time") or conv_ct, role, text)) rows.sort(key=lambda r: r[0] or 0) return rows def import_conversation(convo: dict) -> int: """Import one raw-export conversation. Idempotent by conversation_id.""" session_id = convo.get("conversation_id") or convo.get("id") if not session_id or memory.history(session_id): return 0 rows = _convo_rows(convo) if not rows: return 0 memory.ensure_session(session_id, name=convo.get("title") or "untitled") fallback = datetime.now(timezone.utc).isoformat() exchanges: list[tuple[str, str, list[float], str]] = [] for i in range(0, len(rows), EMBED_BATCH): batch = rows[i : i + EMBED_BATCH] embeddings = llm.embed([text[:EMBED_CHAR_CAP] for _, _, text in batch]) for (ts, role, text), emb in zip(batch, embeddings): exchanges.append((role, text, emb, _ts_to_iso(ts, fallback))) return memory.add_exchanges_bulk(session_id, exchanges) def import_export(export_dir: str | Path, limit: int | None = None) -> dict: """Import a raw ChatGPT export directory (sharded conversations-*.json).""" shards = sorted(Path(export_dir).glob("conversations-*.json")) convos, exchanges, seen = 0, 0, 0 for shard in shards: for convo in json.loads(shard.read_text(encoding="utf-8")): if limit is not None and seen >= limit: break seen += 1 added = import_conversation(convo) if added: convos += 1 exchanges += added if limit is not None and seen >= limit: break logbus.log( "info", "export import complete", shards=len(shards), conversations=convos, exchanges=exchanges, ) return {"shards": len(shards), "conversations_imported": convos, "exchanges": exchanges} def main() -> int: if len(sys.argv) < 2: print("usage: lyra-import [limit]", file=sys.stderr) return 2 path = Path(sys.argv[1]) limit = int(sys.argv[2]) if len(sys.argv) > 2 else None # A raw ChatGPT export has sharded conversations-*.json; otherwise treat the # directory as legacy {title, messages} files. if list(path.glob("conversations-*.json")): report = import_export(path, limit=limit) else: report = import_dir(path) print(report) return 0 if __name__ == "__main__": raise SystemExit(main())