This guide shows exactly how to convert a .pst file into structured records (.jsonl) and a searchable vector database.
Pipeline: PST file -> readpst (extract .eml) -> Python parser (JSONL) -> optional Qwen/Gemma extraction -> embedding model (nomic-embed-text) -> Chroma vector DB -> semantic search / RAG
PowerShell (Admin): wsl --install
Reboot if prompted, then open Ubuntu from Start menu.
sudo apt update
sudo apt install -y pst-utils python3-pip python3-venv
mkdir -p ~/pst_project
cd ~/pst_project
python3 -m venv .venv
source .venv/bin/activate
pip install --upgrade pip
pip install chromadb requests beautifulsoup4
Download: https://ollama.com/download/windows
Then in PowerShell:
ollama pull qwen2.5:7b
ollama pull gemma2:9b
ollama pull nomic-embed-text
Example PST path from Windows into WSL:
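cd ~/pst_project
mkdir -p extracted_mail
readpst -e -o extracted_mail /mnt/c/Users/YOUR_USERNAME/Documents/archive.pst
(The -e flag writes every message as its own .eml file. The -r and -u modes write mbox files instead, which the *.eml parser below would not find.)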
This produces .eml files under ~/pst_project/extracted_mail.
If your machine is already Ubuntu, use this directly:
sudo apt update
sudo apt install -y pst-utils python3-pip python3-venv
mkdir -p ~/pst_project && cd ~/pst_project
python3 -m venv .venv
source .venv/bin/activate
pip install --upgrade pip
pip install chromadb requests beautifulsoup4
mkdir -p extracted_mail
readpst -e -o extracted_mail /path/to/archive.pst
(The -e flag writes every message as its own .eml file. The -r and -u modes write mbox files instead, which the *.eml parser below would not find.)
Create parse_eml_to_jsonl.py in ~/pst_project:
from pathlib import Path
from email import policy
from email.parser import BytesParser
from bs4 import BeautifulSoup
import json
# Directory that is searched recursively for *.eml files.
SRC = Path("extracted_mail")
# JSON Lines output file: one JSON record is written per parsed message.
OUT = Path("emails.jsonl")
def html_to_text(html: str) -> str:
    """Convert an HTML fragment to plain text.

    The markup is parsed with BeautifulSoup's built-in ``html.parser``; the
    text nodes are stripped and joined with newlines.
    """
    soup = BeautifulSoup(html, "html.parser")
    return soup.get_text(separator="\n", strip=True)
def _decode_text_part(part):
    """Return the decoded text of a ``text/*`` part, or ``None`` if unreadable.

    ``get_content()`` raises ``LookupError`` when the part declares a charset
    Python does not know. In that case the raw payload is decoded as UTF-8
    with replacement characters so the body is kept instead of being dropped.
    """
    try:
        return part.get_content()
    except LookupError:
        pass  # unknown charset: fall through to the byte-level fallback
    except Exception:
        return None  # best effort: an undecodable part is skipped
    try:
        raw = part.get_payload(decode=True)
    except Exception:
        return None
    if isinstance(raw, bytes):
        return raw.decode("utf-8", errors="replace")
    return None


def body_from_msg(msg):
    """Extract the readable body text from a parsed email message.

    Args:
        msg: A message parsed with ``email.policy.default`` (it must provide
            ``get_content()``).

    Returns:
        All ``text/plain`` parts joined with newlines and stripped. If there
        are none, all ``text/html`` parts converted with ``html_to_text`` and
        joined the same way. An empty string when neither kind is present.
    """
    text_parts, html_parts = [], []
    multipart = msg.is_multipart()
    for part in (msg.walk() if multipart else (msg,)):
        # Only parts of a multipart message are filtered by disposition;
        # a single-part message is always read.
        if multipart and part.get_content_disposition() == "attachment":
            continue
        ctype = part.get_content_type()
        # Check the type before decoding so images and other binary parts
        # are never decoded just to be thrown away.
        if ctype not in ("text/plain", "text/html"):
            continue
        content = _decode_text_part(part)
        if content is None:
            continue
        if ctype == "text/plain":
            text_parts.append(content)
        else:
            html_parts.append(content)
    if text_parts:
        return "\n".join(text_parts).strip()
    if html_parts:
        return "\n".join(html_to_text(h) for h in html_parts).strip()
    return ""
# Output-record key paired with the message header it is read from.
_HEADER_FIELDS = (
    ("subject", "subject"),
    ("from", "from"),
    ("to", "to"),
    ("cc", "cc"),
    ("date", "date"),
    ("message_id", "message-id"),
)

# Walk every .eml file under SRC and write one JSON line per message to OUT.
# A file that fails to parse or serialise is reported and skipped; `i` counts
# only the messages actually written and doubles as the record id.
i = 0
with OUT.open("w", encoding="utf-8") as out:
    for p in SRC.rglob("*.eml"):
        try:
            with p.open("rb") as f:
                msg = BytesParser(policy=policy.default).parse(f)
            rec = {"id": i, "file": str(p)}
            for key, header in _HEADER_FIELDS:
                rec[key] = msg.get(header, "")
            rec["body"] = body_from_msg(msg)
            out.write(json.dumps(rec, ensure_ascii=False) + "\n")
        except Exception as e:
            print(f"skip {p}: {e}")
        else:
            i += 1
print(f"done: {i} messages -> {OUT}")
Run:
cd ~/pst_project
source .venv/bin/activate
python parse_eml_to_jsonl.py
Use this only if you want enriched metadata (summary, category, action items, entities).
Create enrich_with_qwen.py:
import json, requests
from pathlib import Path
# Input: one JSON email record per line. Output: the same records plus the
# model's raw answer under "llm_extraction_raw".
INP = Path("emails.jsonl")
OUT = Path("emails.enriched.jsonl")
# Base URL of the Ollama HTTP API and the generation model to query.
OLLAMA = "http://localhost:11434"
MODEL = "qwen2.5:7b" # or gemma2:9b
# Prompt template; {email} is replaced with the rendered headers and body.
prompt_t = """Extract JSON with keys:
summary, category, action_items, people, organizations, dates, sensitive_info_present.
Email:\n{email}\nReturn JSON only."""

with INP.open("r", encoding="utf-8") as f, OUT.open("w", encoding="utf-8") as out:
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines in the JSONL file
        rec = json.loads(line)
        # Treat a missing or null body as empty; cap the body at 8000
        # characters before it goes into the prompt.
        body = rec.get("body") or ""
        email_text = (
            f"Subject: {rec.get('subject','')}\nFrom: {rec.get('from','')}\n"
            f"Date: {rec.get('date','')}\n\n{body[:8000]}"
        )
        try:
            r = requests.post(f"{OLLAMA}/api/generate", json={
                "model": MODEL,
                "prompt": prompt_t.format(email=email_text),
                "stream": False
            }, timeout=180)
            r.raise_for_status()
            rec["llm_extraction_raw"] = r.json().get("response", "")
        except (requests.RequestException, ValueError) as e:
            # One timeout or bad reply must not abort the whole run: the output
            # file is opened in "w" mode, so a restart would redo every email.
            # Keep the record and note the failure on it instead.
            print(f"enrichment failed for id={rec.get('id')}: {e}")
            rec["llm_extraction_raw"] = ""
            rec["llm_extraction_error"] = str(e)
        out.write(json.dumps(rec, ensure_ascii=False) + "\n")
print("done")
Create build_vector_db.py:
import json, requests, chromadb
from pathlib import Path
# Input JSON Lines file; each line is parsed as one email record.
INP = Path("emails.jsonl") # or emails.enriched.jsonl
# Filesystem path handed to the persistent Chroma client.
DB_DIR = "./email_vector_db"
# Name of the Chroma collection the documents are added to.
COL = "pst_emails"
# Base URL of the Ollama HTTP API.
OLLAMA = "http://localhost:11434"
# Model name sent with every embedding request.
EMBED_MODEL = "nomic-embed-text"
# Open the persistent store at DB_DIR and fetch the collection, creating it
# if it does not exist yet.
client = chromadb.PersistentClient(path=DB_DIR)
collection = client.get_or_create_collection(COL)
def embed(text):
    """Return the embedding for *text* from the Ollama server.

    Posts ``text`` as the prompt to ``/api/embeddings`` using ``EMBED_MODEL``
    and returns the ``"embedding"`` field of the JSON reply. A non-success
    HTTP status raises via ``raise_for_status()``.
    """
    payload = {"model": EMBED_MODEL, "prompt": text}
    response = requests.post(f"{OLLAMA}/api/embeddings", json=payload, timeout=120)
    response.raise_for_status()
    return response.json()["embedding"]
def render_doc(rec):
    """Render an email record as the plain-text document to embed.

    Emits one ``Label: value`` line each for subject, from, to, cc and date,
    followed by the body. Missing keys render as empty strings, and the
    result is stripped of leading and trailing whitespace.
    """
    labelled = (
        ("Subject", "subject"),
        ("From", "from"),
        ("To", "to"),
        ("CC", "cc"),
        ("Date", "date"),
    )
    lines = [f"{label}: {rec.get(key, '')}" for label, key in labelled]
    lines.append(f"{rec.get('body', '')}")
    return "\n".join(lines).strip()
BATCH_SIZE = 50  # records sent to Chroma per add() call


def _flush(ids, docs, embs, metas):
    """Add one accumulated batch to the collection; do nothing when it is empty."""
    if ids:
        collection.add(ids=ids, documents=docs, embeddings=embs, metadatas=metas)


# Read every record, embed its rendered text and add it to the collection in
# batches of BATCH_SIZE.
batch_ids, batch_docs, batch_embs, batch_meta = [], [], [], []
with INP.open("r", encoding="utf-8") as f:
    for i, line in enumerate(f):
        if not line.strip():
            continue  # tolerate blank lines in the JSONL file
        rec = json.loads(line)
        # render_doc() always emits the header labels, which alone exceed 20
        # characters, so a length check on the rendered document can never
        # filter anything. Test the actual content instead: skip records with
        # neither a subject nor a body.
        subject = str(rec.get("subject") or "").strip()
        body = str(rec.get("body") or "").strip()
        if not subject and not body:
            continue
        doc = render_doc(rec)
        emb = embed(doc)
        batch_ids.append(str(rec.get("id", i)))
        batch_docs.append(doc)
        batch_embs.append(emb)
        # `or ""` turns a JSON null into an empty string so every metadata
        # value stays a plain string.
        batch_meta.append({
            "subject": rec.get("subject") or "",
            "from": rec.get("from") or "",
            "to": rec.get("to") or "",
            "date": rec.get("date") or "",
            "file": rec.get("file") or "",
        })
        if len(batch_ids) >= BATCH_SIZE:
            _flush(batch_ids, batch_docs, batch_embs, batch_meta)
            batch_ids, batch_docs, batch_embs, batch_meta = [], [], [], []
# Write whatever is left over after the last full batch.
_flush(batch_ids, batch_docs, batch_embs, batch_meta)
print("vector DB ready at", DB_DIR)
Run:
source .venv/bin/activate
python build_vector_db.py
Create search_vector_db.py:
import requests, chromadb
# Base URL of the Ollama HTTP API.
OLLAMA = "http://localhost:11434"
# Embedding model name; the same value build_vector_db.py uses when indexing.
EMBED_MODEL = "nomic-embed-text"
# Open the on-disk Chroma store and fetch the existing collection by name.
client = chromadb.PersistentClient(path="./email_vector_db")
collection = client.get_collection("pst_emails")
def embed(text):
    """Return the embedding for *text* from the Ollama server.

    Sends ``text`` as the prompt to ``/api/embeddings`` with ``EMBED_MODEL``
    and returns the ``"embedding"`` entry of the JSON reply. A non-success
    HTTP status raises via ``raise_for_status()``.
    """
    endpoint = f"{OLLAMA}/api/embeddings"
    reply = requests.post(
        endpoint,
        json={"model": EMBED_MODEL, "prompt": text},
        timeout=120,
    )
    reply.raise_for_status()
    parsed = reply.json()
    return parsed["embedding"]
# Prompt for a query, embed it, and print the 8 nearest stored emails.
q = input("Search query: ").strip()
if not q:
    # An empty prompt has nothing to embed; stop before calling Ollama or Chroma.
    raise SystemExit("Empty query; nothing to search.")
res = collection.query(query_embeddings=[embed(q)], n_results=8)
# One query embedding was sent, so the hits are in the first inner list.
for doc, meta in zip(res["documents"][0], res["metadatas"][0]):
    meta = meta or {}  # guard against an entry stored without metadata
    print("\n" + "="*80)
    print("Subject:", meta.get("subject"))
    print("From:", meta.get("from"))
    print("Date:", meta.get("date"))
    print("File:", meta.get("file"))
    print("-"*80)
    # Show only the first 1200 characters of each document.
    print((doc or "")[:1200])
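Troubleshooting:
- readpst: command not found → install pst-utils (Ubuntu/WSL).
- No .eml output → verify PST path and permissions; run with absolute path.
- Ollama requests fail → make sure Ollama is running and reachable at http://localhost:11434.
- Duplicate emails in results → deduplicate on message_id before inserting into vector DB.
Verification checklist:
- emails.jsonl exists and has expected record count
- email_vector_db/ directory exists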