From 388d4351582a85b3a03155aa9c2e80da4cf4df14 Mon Sep 17 00:00:00 2001 From: saymrwulf Date: Wed, 1 Apr 2026 18:57:58 +0200 Subject: [PATCH] Add teaching docs for analytics code --- README.md | 88 +- ct_master_report.py | 3 - ct_monograph_report.py | 24 +- ct_scan.py | 99 +- teachingNoobs/CURRICULUM.md | 44 + teachingNoobs/build_teaching_docs.py | 451 ++++ teachingNoobs/ct_caa_analysis.md | 531 ++++ teachingNoobs/ct_dns_utils.md | 501 ++++ teachingNoobs/ct_focus_subjects.md | 960 ++++++++ teachingNoobs/ct_lineage_report.md | 2184 +++++++++++++++++ teachingNoobs/ct_master_report.md | 1170 +++++++++ teachingNoobs/ct_monograph_report.md | 3349 ++++++++++++++++++++++++++ teachingNoobs/ct_scan.md | 2168 +++++++++++++++++ teachingNoobs/ct_usage_assessment.md | 645 +++++ 14 files changed, 12189 insertions(+), 28 deletions(-) create mode 100644 teachingNoobs/CURRICULUM.md create mode 100644 teachingNoobs/build_teaching_docs.py create mode 100644 teachingNoobs/ct_caa_analysis.md create mode 100644 teachingNoobs/ct_dns_utils.md create mode 100644 teachingNoobs/ct_focus_subjects.md create mode 100644 teachingNoobs/ct_lineage_report.md create mode 100644 teachingNoobs/ct_master_report.md create mode 100644 teachingNoobs/ct_monograph_report.md create mode 100644 teachingNoobs/ct_scan.md create mode 100644 teachingNoobs/ct_usage_assessment.md diff --git a/README.md b/README.md index df6a3f0..cb478a1 100644 --- a/README.md +++ b/README.md @@ -19,7 +19,7 @@ The project is designed for public source control: None of those paths should be committed. -## What You Need On A Fresh Machine +## What You Need On A Fresh macOS Machine ### Required software @@ -38,7 +38,71 @@ None of those paths should be committed. If `xelatex` is missing, the Markdown and LaTeX outputs can still be generated, but the PDF targets will fail. 
-## Fresh Install On Another Computer +### Network access required + +- outbound TCP access to `crt.sh:5432` +- public DNS resolution for `dig` + +The scanner reads Certificate Transparency data directly from the public `certwatch` PostgreSQL service on `crt.sh` using guest access. If that TCP path is blocked, the certificate part of the run will fail even if normal web browsing works. + +## Clean-Room Operator Checklist + +Use this sequence if you need to reproduce the same output structure on another Mac without any extra guidance: + +1. Install the required macOS system tools. +2. Clone the repository. +3. Create the Python virtual environment and install Python dependencies. +4. Create the local-only config files. +5. Put the real search terms into `domains.local.txt`. +6. Optionally put the focused Subject-CN cohort into `focus_subjects.local.txt`. +7. Run `make monograph`. +8. Read the outputs from `output/corpus/`. + +Expected final outputs: + +- `output/corpus/monograph.md` +- `output/corpus/monograph.tex` +- `output/corpus/monograph.pdf` + +The PDF build no longer depends on macOS-only fonts. + +## macOS Install Recipe + +Install Apple command-line tools first: + +```bash +xcode-select --install +``` + +If Homebrew is not already installed, install it from `https://brew.sh`, then install the required tools: + +```bash +brew install python make +brew install --cask mactex-no-gui +``` + +Notes: + +- `git`, `make`, and `dig` are usually already present once Apple command-line tools are installed. +- `mactex-no-gui` provides `xelatex`. +- If `xelatex` is still not on your `PATH` after installation, open a new shell and re-run `which xelatex`. + +## Preflight Checks + +Run these checks before the first full build: + +```bash +python3 --version +git --version +make --version +dig -v +xelatex --version +nc -vz crt.sh 5432 +``` + +If the last command fails, the CT query layer will not be able to reach the public `certwatch` database. 
+ +## Fresh Install On Another Mac Clone the repository from your chosen remote and enter the directory: @@ -63,6 +127,26 @@ Then edit `domains.local.txt` and replace the placeholder values with the real s If you want the monograph to analyse a remembered or suspicious Subject-CN cohort as well, edit `focus_subjects.local.txt` too. The format is one Subject CN per line, optionally followed by analyst notes in parentheses. +## Fastest End-To-End Run + +If the Mac already has the required system tools installed, this is the shortest full path: + +```bash +git clone +cd CertTransparencySearch +make bootstrap +make init-config +# edit domains.local.txt +# optionally edit focus_subjects.local.txt +make monograph +``` + +The canonical results will then be in: + +- `output/corpus/monograph.md` +- `output/corpus/monograph.tex` +- `output/corpus/monograph.pdf` + ## Local Search Terms The tracked file is: diff --git a/ct_master_report.py b/ct_master_report.py index f50f073..d9daecd 100644 --- a/ct_master_report.py +++ b/ct_master_report.py @@ -643,9 +643,6 @@ def render_latex(path: Path, report: dict[str, object]) -> None: r"\usepackage{titlesec}", r"\usepackage[most]{tcolorbox}", r"\defaultfontfeatures{Ligatures=TeX,Scale=MatchLowercase}", - r"\setmainfont{Palatino}", - r"\setsansfont{Avenir Next}", - r"\setmonofont{Menlo}", r"\definecolor{Ink}{HTML}{17202A}", r"\definecolor{Muted}{HTML}{667085}", r"\definecolor{Line}{HTML}{D0D5DD}", diff --git a/ct_monograph_report.py b/ct_monograph_report.py index f1cec3b..37fdfd6 100644 --- a/ct_monograph_report.py +++ b/ct_monograph_report.py @@ -1703,9 +1703,6 @@ def render_latex( r"\usepackage[most]{tcolorbox}", r"\usepackage{pdfpages}", r"\defaultfontfeatures{Ligatures=TeX,Scale=MatchLowercase}", - r"\setmainfont{Palatino}", - r"\setsansfont{Avenir Next}", - r"\setmonofont{Menlo}", r"\definecolor{Ink}{HTML}{17202A}", r"\definecolor{Muted}{HTML}{667085}", r"\definecolor{Line}{HTML}{D0D5DD}", @@ -1723,8 +1720,11 @@ def render_latex( 
r"\pagestyle{plain}", r"\titleformat{\section}{\sffamily\bfseries\LARGE\color{Ink}\raggedright}{\thesection}{0.8em}{}", r"\titleformat{\subsection}{\sffamily\bfseries\Large\color{Ink}\raggedright}{\thesubsection}{0.8em}{}", + r"\titleformat{\subsubsection}{\sffamily\bfseries\normalsize\color{Ink}\raggedright}{\thesubsubsection}{0.8em}{}", r"\tcbset{panel/.style={enhanced,breakable,boxrule=0.55pt,arc=3pt,left=9pt,right=9pt,top=8pt,bottom=8pt,colback=white,colframe=Line}}", - r"\newcommand{\SummaryBox}[1]{\begin{tcolorbox}[panel,colback=Panel]#1\end{tcolorbox}}", + r"\newcommand{\SummaryBox}[1]{\begin{tcolorbox}[enhanced,boxrule=0.55pt,arc=3pt,left=9pt,right=9pt,top=8pt,bottom=8pt,colback=Panel,colframe=Line]#1\end{tcolorbox}}", + r"\newcommand{\SoftSubsection}[1]{\Needspace{12\baselineskip}\subsection{#1}}", + r"\newcommand{\SoftSubsubsection}[1]{\Needspace{10\baselineskip}\subsubsection{#1}}", r"\begin{document}", r"\begin{titlepage}", r"\vspace*{16mm}", @@ -1743,7 +1743,11 @@ def render_latex( + rf"{purpose_summary.category_counts.get('tls_server_and_client', 0)} certificates from templates that also permit client-certificate use." 
+ r"}", r"\end{titlepage}", + r"\begingroup", + r"\small", + r"\setlength{\parskip}{2pt}", r"\tableofcontents", + r"\endgroup", r"\clearpage", ] @@ -2744,7 +2748,17 @@ def render_latex( r"\end{document}", ] ) - args.latex_output.write_text("\n".join(lines) + "\n", encoding="utf-8") + def soften_heading(line: str) -> str: + if line.startswith(r"\subsection{"): + return line.replace(r"\subsection{", r"\SoftSubsection{", 1) + if line.startswith(r"\subsubsection{"): + return line.replace(r"\subsubsection{", r"\SoftSubsubsection{", 1) + return line + + args.latex_output.write_text( + "\n".join(soften_heading(line) for line in lines) + "\n", + encoding="utf-8", + ) def main() -> int: diff --git a/ct_scan.py b/ct_scan.py index ec88f2d..ab6acef 100644 --- a/ct_scan.py +++ b/ct_scan.py @@ -755,7 +755,7 @@ def build_san_tree_lines(san_entries: list[str]) -> list[str]: return build_san_tree_lines_with_style(san_entries, ascii_only=False) -def build_san_tree_lines_with_style(san_entries: list[str], ascii_only: bool) -> list[str]: +def build_san_tree_units_with_style(san_entries: list[str], ascii_only: bool) -> list[list[str]]: dns_entries = sorted({entry[4:] for entry in san_entries if entry.startswith("DNS:")}) other_entries = sorted({entry for entry in san_entries if not entry.startswith("DNS:")}) tree: dict[str, Any] = {} @@ -784,11 +784,55 @@ def build_san_tree_lines_with_style(san_entries: list[str], ascii_only: bool) -> lines.extend(render(child, child_prefix)) return lines - lines = render(tree) + units: list[list[str]] = [] + for key in sorted(tree.keys(), key=str.casefold): + units.append(render({key: tree[key]})) for entry in other_entries: - lines.append(f"{'*' if ascii_only else '•'} {entry}") - if not lines: - lines.append(f"{'*' if ascii_only else '•'} -") + units.append([f"{'*' if ascii_only else '•'} {entry}"]) + if not units: + units.append([f"{'*' if ascii_only else '•'} -"]) + return units + + +def build_san_tree_chunks_with_style( + san_entries: list[str], 
+ ascii_only: bool, + max_lines_per_chunk: int = 24, +) -> list[list[str]]: + chunks: list[list[str]] = [] + current_chunk: list[str] = [] + current_lines = 0 + + def flush_current_chunk() -> None: + nonlocal current_chunk, current_lines + if current_chunk: + chunks.append(current_chunk) + current_chunk = [] + current_lines = 0 + + for unit in build_san_tree_units_with_style(san_entries, ascii_only=ascii_only): + if len(unit) > max_lines_per_chunk: + flush_current_chunk() + for start in range(0, len(unit), max_lines_per_chunk): + chunks.append(unit[start : start + max_lines_per_chunk]) + continue + if current_chunk and current_lines + len(unit) > max_lines_per_chunk: + flush_current_chunk() + current_chunk.extend(unit) + current_lines += len(unit) + + flush_current_chunk() + return chunks + + +def build_san_tree_lines_with_style(san_entries: list[str], ascii_only: bool) -> list[str]: + lines: list[str] = [] + for chunk in build_san_tree_chunks_with_style( + san_entries, + ascii_only=ascii_only, + max_lines_per_chunk=10_000, + ): + lines.extend(chunk) return lines @@ -1033,9 +1077,6 @@ def render_latex_report( r"\usepackage{fancyvrb}", r"\usepackage{needspace}", r"\defaultfontfeatures{Ligatures=TeX,Scale=MatchLowercase}", - r"\setmainfont{Palatino}", - r"\setsansfont{Avenir Next}", - r"\setmonofont{Menlo}", r"\definecolor{Ink}{HTML}{17202A}", r"\definecolor{Muted}{HTML}{667085}", r"\definecolor{Line}{HTML}{D0D5DD}", @@ -1071,7 +1112,7 @@ def render_latex_report( r" issuerpanel/.style={panel,colback=Panel,colframe=Ink!45},", r" familypanel/.style={panel,colback=AccentSoft,colframe=AccentLine},", r" subjectpanel/.style={panel,colback=white,colframe=Line},", - r" treepanel/.style={panel,colback=Panel,colframe=AccentLine},", + r" treepanel/.style={enhanced,boxrule=0.55pt,arc=3pt,left=9pt,right=9pt,top=8pt,bottom=8pt,colback=Panel,colframe=AccentLine},", r"}", r"\newcommand{\DomainChip}[1]{\tcbox[on 
line,boxrule=0pt,arc=3pt,left=5pt,right=5pt,top=2pt,bottom=2pt,colback=AccentSoft]{\sffamily\footnotesize\texttt{#1}}}", r"\newcommand{\MetricChip}[2]{\tcbox[on line,boxrule=0pt,arc=3pt,left=6pt,right=6pt,top=3pt,bottom=3pt,colback=Panel]{\sffamily\footnotesize\textcolor{Muted}{#1}\hspace{0.45em}\textbf{#2}}}", @@ -1225,6 +1266,11 @@ def render_latex_report( rf"\newline \textcolor{{Muted}}{{SANs: {len(hit.san_entries)} \quad crt.sh: {latex_escape(crtsh_ids)} \quad {latex_escape(one_line_revocation(hit))}}}", ] ) + tree_chunks = build_san_tree_chunks_with_style( + unique_san_entries, + ascii_only=True, + max_lines_per_chunk=24, + ) lines.extend( [ r"\end{itemize}", @@ -1240,18 +1286,35 @@ def render_latex_report( rf"\textbf{{Dominant zones}}: {latex_escape(', '.join(f'{zone} ({count})' for zone, count in san_summary['top_zones']) if san_summary['top_zones'] else 'none')}", r"\par", rf"\textbf{{Repeating host schemas}}: {latex_escape(', '.join(f'{pattern} ({count})' for pattern, count in san_summary['repeating_patterns']) if san_summary['repeating_patterns'] else 'mostly one-off SAN hostnames')}", - r"\end{tcolorbox}", - r"\begin{tcolorbox}[treepanel,title={SAN Structure}]", - r"\begin{Verbatim}[fontsize=\footnotesize]", - ] - ) - lines.extend(build_san_tree_lines_with_style(unique_san_entries, ascii_only=True)) - lines.extend( - [ - r"\end{Verbatim}", + ( + rf"\par\medskip\textcolor{{Muted}}{{The SAN structure below is shown in {len(tree_chunks)} intact panels so the visual grouping is not broken across a page.}}" + if len(tree_chunks) > 1 + else "" + ), r"\end{tcolorbox}", ] ) + for tree_chunk_index, tree_lines in enumerate(tree_chunks, start=1): + tree_title = ( + "SAN Structure" + if len(tree_chunks) == 1 + else f"SAN Structure ({tree_chunk_index}/{len(tree_chunks)})" + ) + tree_needspace = max(12, min(len(tree_lines) + 7, 32)) + lines.extend( + [ + rf"\Needspace{{{tree_needspace}\baselineskip}}", + 
rf"\begin{{tcolorbox}}[treepanel,title={{{latex_escape(tree_title)}}}]", + r"\begin{Verbatim}[fontsize=\footnotesize]", + ] + ) + lines.extend(tree_lines) + lines.extend( + [ + r"\end{Verbatim}", + r"\end{tcolorbox}", + ] + ) lines.extend( [ diff --git a/teachingNoobs/CURRICULUM.md b/teachingNoobs/CURRICULUM.md new file mode 100644 index 0000000..5d4d1bf --- /dev/null +++ b/teachingNoobs/CURRICULUM.md @@ -0,0 +1,44 @@ +# teachingNoobs Curriculum + +Open each file in VS Code and use Markdown Preview. The intended order is: + +1. [ct_scan.md](./ct_scan.md) + Why first: this is the core analytics engine. If you understand this file, you understand where the certificate facts come from. +2. [ct_dns_utils.md](./ct_dns_utils.md) + Why second: this explains how the DNS side was scanned and interpreted. +3. [ct_usage_assessment.md](./ct_usage_assessment.md) + Why third: this explains how certificate purpose was classified from EKU and KeyUsage. +4. [ct_lineage_report.md](./ct_lineage_report.md) + Why fourth: this adds historical time and red-flag logic. +5. [ct_caa_analysis.md](./ct_caa_analysis.md) + Why fifth: this adds the DNS-side issuance-policy layer. +6. [ct_focus_subjects.md](./ct_focus_subjects.md) + Why sixth: this explains the special hand-picked Subject-CN cohort logic. +7. [ct_master_report.md](./ct_master_report.md) + Why seventh: this shows how the current-state analytical layers are stitched into one coherent bundle. +8. [ct_monograph_report.md](./ct_monograph_report.md) + Why last: this is the publishing layer. Read it last because it is about presentation and assembly, not fact extraction. + +Suggested reading method: + +- Keep the Markdown preview open. +- For each page, read the explanation on the right first. +- Then look left at the code block and see how the explanation maps onto the exact lines. +- Do not try to memorize every helper function on first pass. Focus on the few blocks that move real data from one stage to the next. 
+- Pay special attention to the new `Flow arrows` panel on the right side. That panel tells you where the block's output goes next. + +What matters most: + +- In `ct_scan.py`: how raw database rows become verified leaf certificates. +- In `ct_dns_utils.py`: how raw DNS answers become delivery clues. +- In `ct_lineage_report.py`: how the code decides what is a normal renewal versus a red flag. +- In `ct_caa_analysis.py`: how live DNS policy is compared with live certificate coverage. +- In `ct_master_report.py`: how the current-state pieces are combined. + +What matters less on first read: + +- tiny formatting helpers +- string-wrapping helpers +- Markdown/LaTeX table plumbing + +Those are still useful, but they are support code, not the heart of the analytics. diff --git a/teachingNoobs/build_teaching_docs.py b/teachingNoobs/build_teaching_docs.py new file mode 100644 index 0000000..e9a1823 --- /dev/null +++ b/teachingNoobs/build_teaching_docs.py @@ -0,0 +1,451 @@ +#!/usr/bin/env python3 + +from __future__ import annotations + +import ast +import html +from pathlib import Path + + +ROOT = Path(__file__).resolve().parents[1] +OUT_DIR = ROOT / "teachingNoobs" + +SOURCE_FILES = [ + "ct_scan.py", + "ct_dns_utils.py", + "ct_usage_assessment.py", + "ct_lineage_report.py", + "ct_caa_analysis.py", + "ct_focus_subjects.py", + "ct_master_report.py", + "ct_monograph_report.py", +] + +FILE_INTROS = { + "ct_scan.py": ( + "Core Certificate Transparency scanner. This file talks to crt.sh's public " + "database, downloads the real certificate bytes, verifies that they are real " + "leaf certificates, groups them into readable families, and can render the " + "full inventory appendix." + ), + "ct_dns_utils.py": ( + "Public DNS scanner. This file runs dig, follows alias chains, finds public " + "addresses, and collapses raw DNS evidence into readable delivery labels." + ), + "ct_usage_assessment.py": ( + "Certificate-purpose analyzer. 
This file looks at EKU and KeyUsage to decide " + "what each certificate is technically allowed to do." + ), + "ct_lineage_report.py": ( + "Historical analyzer. This file studies expired plus current certificates to " + "find renewals, overlap, drift, and issuance bursts over time." + ), + "ct_caa_analysis.py": ( + "CAA analyzer. This file resolves live DNS issuance policy and compares it " + "against the public CA families that are actually covering the names today." + ), + "ct_focus_subjects.py": ( + "Focused-cohort analyzer. This file takes your special hand-picked Subject CN " + "list and compares it against the wider certificate and DNS estate." + ), + "ct_master_report.py": ( + "Current-state synthesizer. This file combines certificate facts, DNS facts, " + "purpose classification, grouping, and curated examples into one report bundle." + ), + "ct_monograph_report.py": ( + "Publication builder. This file takes all analytical layers and turns them into " + "the final monograph in Markdown, LaTeX, and PDF." 
+ ), +} + +FILE_FLOW_STRIPS = { + "ct_scan.py": "domains file -> raw CT query -> parsed leaf certificates -> CN families -> issuer trust -> appendix reports", + "ct_dns_utils.py": "DNS name -> dig answers -> normalized observation -> provider hints -> delivery label", + "ct_usage_assessment.py": "certificate bytes -> EKU and KeyUsage -> purpose label -> summary counts", + "ct_lineage_report.py": "historical CT rows -> historical certificates -> grouped by Subject CN -> overlap and drift checks -> red flags", + "ct_caa_analysis.py": "DNS name -> effective CAA lookup -> allowed CA families -> compare with live cert families", + "ct_focus_subjects.py": "focus-subject file -> cohort entries -> compare against current and historical estate -> bucketed cohort explanation", + "ct_master_report.py": "current CT facts + DNS facts + usage facts -> one current-state report bundle", + "ct_monograph_report.py": "current-state bundle + history + CAA + focused cohort -> Markdown/LaTeX/PDF monograph", +} + +BLOCK_NOTES = { + "ct_scan.py": { + "__module__": "Imports, SQL, constants, and shared data shapes for the core CT scanner.", + "DatabaseRecord": "A raw row as it comes back from the crt.sh database before local cleanup.", + "CertificateHit": "The cleaned working object used by the rest of the analytics pipeline.", + "VerificationStats": "A tiny running counter that proves how many rows were kept or rejected.", + "CertificateGroup": "One readable family of related certificates after grouping logic runs.", + "ScanStats": "Top-level summary numbers used in reports.", + "IssuerTrustInfo": "Stores the public-trust picture for one issuer family.", + "connect": "Opens the direct guest PostgreSQL connection to crt.sh's certwatch backend.", + "query_domain": "Runs the main certificate query for one search term and refuses silent undercounting.", + "query_raw_match_count": "Counts how many raw hits exist before the capped query runs.", + "build_hits": "Parses certificate bytes, rejects 
bad objects, and merges duplicate views of the same cert.", + "build_groups": "Turns a flat certificate list into CN-based families such as exact endpoints or numbered rails.", + "query_issuer_trust": "Checks which issuers are currently trusted for public TLS in the major WebPKI contexts.", + "render_markdown_report": "Writes the raw inventory appendix as readable Markdown.", + "render_latex_report": "Writes the raw inventory appendix as LaTeX for PDF assembly.", + "compile_latex_to_pdf": "Hands LaTeX to XeLaTeX and turns it into a finished PDF file.", + "main": "The standalone command-line entrypoint for the inventory scanner.", + }, + "ct_dns_utils.py": { + "__module__": "Shared DNS scanning helpers, cache helpers, and the logic that turns raw DNS answers into platform clues.", + "DnsObservation": "One complete DNS observation for one hostname.", + "scan_name_live": "Runs the live DNS walk for one hostname.", + "scan_name_cached": "Reuses a recent DNS result if possible, otherwise performs the live scan.", + "infer_provider_hints": "Reads the raw DNS trail and pulls out likely platform or vendor clues.", + "infer_stack_signature": "Collapses several low-level DNS clues into one human-readable delivery label.", + "provider_explanations": "Supplies the glossary text used later in the reports.", + }, + "ct_usage_assessment.py": { + "__module__": "Purpose-analysis constants and small data shapes for EKU and KeyUsage classification.", + "PurposeClassification": "One certificate plus the usage label assigned to it.", + "AssessmentSummary": "The roll-up numbers that power the purpose chapter.", + "build_classifications": "Walks through all current certificates and labels them by intended usage.", + "summarize": "Compresses the per-certificate labels into counts, templates, and issuer breakdowns.", + "render_markdown": "Writes the standalone purpose report.", + "main": "The standalone command-line entrypoint for the purpose analyzer.", + }, + "ct_lineage_report.py": { + 
"__module__": "Historical query logic, data structures, and red-flag rules for certificate lifecycle analysis.", + "HistoricalCertificate": "One certificate in the full time-based dataset, including expired ones.", + "CnCollisionRow": "A table row for Subject-DN drift or issuer drift under the same Subject CN.", + "SanChangeRow": "A table row that describes SAN-profile change for one Subject CN.", + "OverlapRow": "A table row describing long predecessor/successor overlap.", + "RedFlagRow": "A compact summary row for names worth attention.", + "HistoricalAssessment": "The full historical analysis bundle used by the monograph.", + "query_historical_domain": "Fetches the wider historical corpus for one search term.", + "build_certificates": "Converts raw DB rows into historical working objects.", + "dn_change_rows": "Finds names whose formal Subject DN changed over time.", + "issuer_change_rows": "Finds names whose issuing CA family changed over time.", + "san_change_rows": "Finds names whose SAN bundle changed over time.", + "overlap_rows": "Finds predecessor/successor pairs that overlap too long.", + "build_assessment": "Runs the full historical workflow and returns the finished analytical bundle.", + "render_markdown": "Writes the standalone historical report in Markdown.", + "render_latex": "Writes the standalone historical report in LaTeX.", + "main": "The standalone command-line entrypoint for the historical analyzer.", + }, + "ct_caa_analysis.py": { + "__module__": "Data structures and lookup logic for effective CAA policy analysis.", + "CaaObservation": "One resolved CAA result before it is merged with certificate coverage data.", + "CaaNameRow": "One final row that compares DNS policy with current live certificate families.", + "CaaAnalysis": "The full CAA analysis bundle used by the monograph.", + "relevant_caa_live": "Finds the effective live CAA for one name, including inheritance and alias behavior.", + "build_analysis": "Runs CAA across the whole SAN 
namespace and compares policy with live issuance.", + "rows_for_zone": "Filters the full analysis down to one configured DNS zone.", + }, + "ct_focus_subjects.py": { + "__module__": "Rules and data shapes for analyzing the special hand-picked Subject-CN cohort.", + "FocusSubject": "One line from the local focus-subject file.", + "FocusSubjectDetail": "One detailed analytical row for one focused Subject CN.", + "FocusCohortAnalysis": "The full cohort comparison bundle used in the monograph.", + "load_focus_subjects": "Reads the local focus-subject list and any analyst notes attached to it.", + "classify_taxonomy_bucket": "Places a name into the direct-front, platform-anchor, or ambiguous bucket.", + "observed_role": "Tries to describe what role the name appears to play in the public estate.", + "build_analysis": "Runs the full comparison between the focused cohort and the rest of the estate.", + }, + "ct_master_report.py": { + "__module__": "Current-state report assembly code that sits above the low-level scanners.", + "ExampleBlock": "A small narrative evidence block used in the naming chapter.", + "load_records": "Loads current CT records for all configured search terms.", + "enrich_dns": "Adds DNS observations and provider clues to the raw SAN-name list.", + "pick_examples": "Chooses a few representative examples that make the naming and DNS story understandable.", + "build_group_digest": "Builds a compact family catalogue used in reports.", + "summarize_for_report": "Creates the big current-state dictionary consumed by the monograph builder.", + "render_markdown": "Writes the shorter consolidated report in Markdown.", + "render_latex": "Writes the shorter consolidated report in LaTeX.", + "main": "The standalone command-line entrypoint for the consolidated current-state report.", + }, + "ct_monograph_report.py": { + "__module__": "The orchestration and publishing layer that turns all analytical modules into one publication.", + "render_appendix_inventory": 
"Generates the hidden full inventory appendix before the main monograph is assembled.", + "append_longtable": "Shared LaTeX helper for readable multi-page tables.", + "render_markdown": "Writes the narrative monograph in Markdown.", + "render_latex": "Writes the narrative monograph in LaTeX.", + "main": "The top-level command-line entrypoint for the complete monograph build.", + }, +} + +BLOCK_FLOWS = { + "ct_scan.py": { + "Module setup": ("Nothing yet; this is the starting point.", "`connect`, `query_domain`, `build_hits`, and the report renderers use these shared definitions."), + "load_domains": ("Operator's local config file.", "`query_domain` and the higher-level loaders use this cleaned domain list."), + "connect": ("Called by query functions that need live crt.sh data.", "`query_domain`, `query_raw_match_count`, and issuer-trust lookups all depend on this connection."), + "query_raw_match_count": ("A domain string from the local config.", "`query_domain` uses this count to refuse silent undercounting."), + "query_domain": ("A domain plus the safety cap and retry settings.", "`build_hits` receives the raw records returned here."), + "build_hits": ("Raw `DatabaseRecord` rows from crt.sh.", "`build_groups`, purpose analysis, DNS analysis, and CAA analysis all consume these cleaned hits."), + "build_groups": ("The flat list of `CertificateHit` objects.", "The report builders use these groups to turn raw certificate clutter into readable families."), + "query_issuer_trust": ("The cleaned current certificate hits.", "Report builders use this trust view in the certificate chapters and appendix tables."), + "render_markdown_report": ("Current hits, groups, and trust data.", "Produces the Markdown inventory appendix."), + "render_latex_report": ("Current hits, groups, and trust data.", "Produces the LaTeX appendix source that later becomes PDF."), + "compile_latex_to_pdf": ("A finished `.tex` file.", "Produces the human-readable PDF artifact."), + "main": ("CLI 
arguments from the operator.", "Runs the whole scanner end to end."), + }, + "ct_dns_utils.py": { + "Module setup": ("Nothing yet; this is the starting point.", "The later DNS helpers all reuse these imports and small shared helpers."), + "run_dig": ("A hostname and record type.", "`scan_name_live`, `dig_status`, `dig_short`, and `ptr_lookup` all rely on this."), + "scan_name_live": ("One DNS name from a SAN entry.", "`scan_name_cached` returns this result shape to higher-level analytics."), + "scan_name_cached": ("A DNS name plus cache settings.", "`ct_master_report.enrich_dns` uses this for every SAN name in the current corpus."), + "infer_provider_hints": ("One normalized DNS observation.", "`infer_stack_signature` and the report layers use the hints it produces."), + "infer_stack_signature": ("One DNS observation plus provider clues.", "`ct_master_report` uses the resulting label in naming and DNS chapters."), + "provider_explanations": ("The delivery labels used by the report.", "The monograph glossary uses these explanations directly."), + }, + "ct_usage_assessment.py": { + "extract_eku_oids": ("One certificate object.", "`classify_purpose` uses these OIDs to decide the category."), + "extract_key_usage_flags": ("One certificate object.", "`build_classifications` stores these flags as supporting evidence."), + "classify_purpose": ("The EKU OID list from one certificate.", "`build_classifications` turns that decision into a per-certificate record."), + "build_classifications": ("The cleaned current hits plus raw records.", "`summarize` compresses these rows into report-level counts."), + "summarize": ("The per-certificate purpose labels.", "Current-state and monograph chapters use the summary counts and templates."), + "main": ("CLI arguments from the operator.", "Runs the standalone purpose analysis end to end."), + }, + "ct_lineage_report.py": { + "query_historical_domain": ("A configured search domain.", "`load_records` uses it to build the wider historical 
corpus."), + "build_certificates": ("Historical `DatabaseRecord` rows.", "`group_by_subject_cn` and all drift checks consume these normalized historical certificates."), + "group_by_subject_cn": ("Historical certificates.", "`dn_change_rows`, `issuer_change_rows`, `san_change_rows`, and `overlap_rows` all work off this grouping."), + "dn_change_rows": ("CN-grouped historical certificates.", "`build_assessment` uses these rows for Subject-DN drift sections."), + "issuer_change_rows": ("CN-grouped historical certificates.", "`build_assessment` uses these rows for CA-family drift sections."), + "san_change_rows": ("CN-grouped historical certificates.", "`build_assessment` uses these rows for SAN-drift sections."), + "overlap_rows": ("CN-grouped historical certificates.", "`build_assessment` turns these into current and past overlap red flags."), + "build_assessment": ("Historical records from all configured domains.", "The monograph and standalone historical reports consume this one big bundle."), + "main": ("CLI arguments from the operator.", "Runs the standalone historical analysis end to end."), + }, + "ct_caa_analysis.py": { + "relevant_caa_live": ("One DNS name from the SAN universe.", "`build_analysis` uses this to learn the effective issuance policy per name."), + "allowed_ca_families": ("Raw CAA rows for one effective policy.", "`build_analysis` uses the normalized families for policy-vs-live comparison."), + "build_analysis": ("Current certificate hits and the configured zones.", "The monograph uses this for the CAA chapter and appendix."), + "rows_for_zone": ("The full CAA analysis bundle.", "The monograph uses zone-filtered rows for per-zone policy tables."), + }, + "ct_focus_subjects.py": { + "load_focus_subjects": ("The local focus-subject file.", "`build_analysis` uses these parsed cohort entries."), + "classify_taxonomy_bucket": ("One focused Subject CN plus surrounding evidence.", "`build_analysis` uses the bucket label in the focused-cohort 
chapter."), + "observed_role": ("One focused Subject CN plus public evidence.", "`build_analysis` stores the plain-English role description."), + "build_analysis": ("The focus-subject list, current-state report, and historical assessment.", "The monograph uses the resulting bundle for Chapter 8 and Appendix D."), + }, + "ct_master_report.py": { + "load_records": ("Configured domains from the local file.", "`summarize_for_report` uses the returned CT rows as its starting point."), + "enrich_dns": ("The unique SAN DNS names from current hits.", "`summarize_for_report` uses the enriched observations for DNS chapters and examples."), + "pick_examples": ("Current hits, groups, and DNS observations.", "`summarize_for_report` stores the chosen examples for the naming chapter."), + "build_group_digest": ("Current groups plus DNS observations.", "Report builders use the digest in appendices and summary tables."), + "summarize_for_report": ("Current CT rows, DNS observations, issuer trust, and usage facts.", "`ct_monograph_report.main` consumes this as the main current-state input."), + "main": ("CLI arguments from the operator.", "Runs the shorter consolidated current-state report end to end."), + }, + "ct_monograph_report.py": { + "render_appendix_inventory": ("The current-state report bundle.", "Creates the hidden appendix files that are later embedded into the monograph."), + "render_markdown": ("Current-state facts, history, CAA, and focused-cohort analysis.", "Produces the main Markdown monograph."), + "render_latex": ("Current-state facts, history, CAA, and focused-cohort analysis.", "Produces the main LaTeX monograph source."), + "main": ("CLI arguments from the operator.", "Runs the full publication pipeline from raw analytics to finished PDF."), + }, +} + +CURRICULUM = """# teachingNoobs Curriculum + +Open each file in VS Code and use Markdown Preview. The intended order is: + +1. [ct_scan.md](./ct_scan.md) + Why first: this is the core analytics engine. 
If you understand this file, you understand where the certificate facts come from. +2. [ct_dns_utils.md](./ct_dns_utils.md) + Why second: this explains how the DNS side was scanned and interpreted. +3. [ct_usage_assessment.md](./ct_usage_assessment.md) + Why third: this explains how certificate purpose was classified from EKU and KeyUsage. +4. [ct_lineage_report.md](./ct_lineage_report.md) + Why fourth: this adds historical time and red-flag logic. +5. [ct_caa_analysis.md](./ct_caa_analysis.md) + Why fifth: this adds the DNS-side issuance-policy layer. +6. [ct_focus_subjects.md](./ct_focus_subjects.md) + Why sixth: this explains the special hand-picked Subject-CN cohort logic. +7. [ct_master_report.md](./ct_master_report.md) + Why seventh: this shows how the current-state analytical layers are stitched into one coherent bundle. +8. [ct_monograph_report.md](./ct_monograph_report.md) + Why last: this is the publishing layer. Read it last because it is about presentation and assembly, not fact extraction. + +Suggested reading method: + +- Keep the Markdown preview open. +- For each page, read the explanation on the right first. +- Then look left at the code block and see how the explanation maps onto the exact lines. +- Do not try to memorize every helper function on first pass. Focus on the few blocks that move real data from one stage to the next. +- Pay special attention to the new `Flow arrows` panel on the right side. That panel tells you where the block's output goes next. + +What matters most: + +- In `ct_scan.py`: how raw database rows become verified leaf certificates. +- In `ct_dns_utils.py`: how raw DNS answers become delivery clues. +- In `ct_lineage_report.py`: how the code decides what is a normal renewal versus a red flag. +- In `ct_caa_analysis.py`: how live DNS policy is compared with live certificate coverage. +- In `ct_master_report.py`: how the current-state pieces are combined. 
+ +What matters less on first read: + +- tiny formatting helpers +- string-wrapping helpers +- Markdown/LaTeX table plumbing + +Those are still useful, but they are support code, not the heart of the analytics. +""" + + +def block_span(node: ast.AST, next_node: ast.AST | None, total_lines: int) -> tuple[int, int]: + start = min((item.lineno for item in getattr(node, "decorator_list", []) if hasattr(item, "lineno")), default=node.lineno) + end = getattr(node, "end_lineno", None) or total_lines + return start, end + + +def fallback_explanation(file_name: str, block_name: str, kind: str) -> str: + lower = block_name.lower() + if kind == "class": + return "This class is a structured container for one piece of data that later code passes around instead of juggling many loose variables." + if lower == "parse_args": + return "This block defines the command-line knobs for the file: input paths, cache settings, output paths, and other runtime switches." + if lower == "main": + return "This is the file's entrypoint. It glues the earlier helper blocks together into one end-to-end run." + if lower.startswith("load_"): + return "This block loads data from disk, cache, or an earlier stage so later code can work with it." + if lower.startswith("store_"): + return "This block saves an intermediate result so the next run can reuse it instead of recomputing everything." + if lower.startswith("query_"): + return "This block asks an external source for data and returns it in a shape the rest of the file can use." + if lower.startswith("extract_"): + return "This block pulls one specific piece of information out of a larger object." + if lower.startswith("build_"): + return "This block constructs a richer higher-level result from simpler inputs." + if lower.startswith("render_"): + return "This block turns structured analysis data into human-readable output." + if lower.startswith("classify_"): + return "This block applies rules and chooses a category label." 
+ if lower.startswith("summarize_") or lower == "summarize": + return "This block compresses many detailed rows into a smaller, easier-to-read summary." + if lower.startswith("compile_"): + return "This block hands an intermediate artifact to an external tool so it becomes a finished output file." + if lower.startswith("group_"): + return "This block clusters related items together so later code can analyze them as families instead of as isolated rows." + if lower.startswith("normalize_") or lower.startswith("canonicalize_"): + return "This block makes values consistent so matching and grouping do not get confused by superficial differences." + if lower.startswith("pct") or lower in {"utc_iso", "truncate_text", "first_list_item"}: + return "This is a small helper that keeps the larger analytical code cleaner and easier to reuse." + return f"This {kind} is one of the building blocks inside `{file_name}`. It exists so the file can do one narrow job at a time instead of one giant unreadable routine." + + +def explain_block(file_name: str, block_name: str, kind: str) -> str: + specific = BLOCK_NOTES.get(file_name, {}).get(block_name) + if specific: + return specific + return fallback_explanation(file_name, block_name, kind) + + +def code_panel(code: str, language: str = "python") -> str: + escaped = html.escape(code.rstrip()) + return ( + '
'
+        + escaped
+        + "
" + ) + + +def explanation_panel(title: str, text: str) -> str: + return ( + f"

{html.escape(title)}

" + f"

{html.escape(text)}

" + ) + + +def flow_panel(file_name: str, block_name: str) -> str: + upstream, downstream = BLOCK_FLOWS.get(file_name, {}).get( + block_name, + ( + "Earlier blocks or operator input feed this block.", + "Later blocks in the same file or in the next analytical stage consume its output.", + ), + ) + return ( + "

Flow arrows

" + f"

{html.escape(upstream)} → {html.escape(block_name)} → {html.escape(downstream)}

" + ) + + +def make_doc_for_file(file_name: str) -> str: + path = ROOT / file_name + source = path.read_text(encoding="utf-8") + lines = source.splitlines() + tree = ast.parse(source, filename=file_name) + top_nodes = [node for node in tree.body if isinstance(node, (ast.ClassDef, ast.FunctionDef, ast.AsyncFunctionDef))] + + blocks: list[tuple[str, str, str]] = [] + if top_nodes: + first_start = min( + (item.lineno for item in getattr(top_nodes[0], "decorator_list", []) if hasattr(item, "lineno")), + default=top_nodes[0].lineno, + ) + preamble_end = first_start - 1 + if preamble_end >= 1: + preamble_code = "\n".join(lines[:preamble_end]).rstrip() + if preamble_code: + blocks.append(("Module setup", "module", preamble_code)) + + for index, node in enumerate(top_nodes): + next_node = top_nodes[index + 1] if index + 1 < len(top_nodes) else None + start, end = block_span(node, next_node, len(lines)) + code = "\n".join(lines[start - 1 : end]).rstrip() + kind = "class" if isinstance(node, ast.ClassDef) else "function" + blocks.append((node.name, kind, code)) + + page_lines = [ + f"# {file_name}", + "", + f"Source file: [`{file_name}`](../{file_name})", + "", + FILE_INTROS[file_name], + "", + f"Main flow in one line: `{FILE_FLOW_STRIPS[file_name]}`", + "", + "How to read this page:", + "", + "- left side: the actual source code block", + "- right side: a plain-English explanation for a beginner", + "- read from top to bottom because later blocks depend on earlier ones", + "", + ] + + for title, kind, code in blocks: + explanation = explain_block(file_name, "__module__" if kind == "module" else title, kind) + page_lines.extend( + [ + f"## {title}", + "", + '', + "", + '", + '", + "", + "
', + code_panel(code), + "', + explanation_panel("What this block is doing", explanation), + flow_panel(file_name, title), + explanation_panel( + "How to think about it", + "Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?", + ), + "
", + "", + ] + ) + + return "\n".join(page_lines) + "\n" + + +def main() -> int: + OUT_DIR.mkdir(parents=True, exist_ok=True) + for file_name in SOURCE_FILES: + doc_path = OUT_DIR / file_name.replace(".py", ".md") + doc_path.write_text(make_doc_for_file(file_name), encoding="utf-8") + (OUT_DIR / "CURRICULUM.md").write_text(CURRICULUM, encoding="utf-8") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/teachingNoobs/ct_caa_analysis.md b/teachingNoobs/ct_caa_analysis.md new file mode 100644 index 0000000..d69da3e --- /dev/null +++ b/teachingNoobs/ct_caa_analysis.md @@ -0,0 +1,531 @@ +# ct_caa_analysis.py + +Source file: [`ct_caa_analysis.py`](../ct_caa_analysis.py) + +CAA analyzer. This file resolves live DNS issuance policy and compares it against the public CA families that are actually covering the names today. + +Main flow in one line: `DNS name -> effective CAA lookup -> allowed CA families -> compare with live cert families` + +How to read this page: + +- left side: the actual source code block +- right side: a plain-English explanation for a beginner +- read from top to bottom because later blocks depend on earlier ones + +## Module setup + + + + + + +
+
#!/usr/bin/env python3
+
+from __future__ import annotations
+
+from collections import Counter, defaultdict
+from dataclasses import asdict, dataclass
+from datetime import UTC, datetime
+from pathlib import Path
+from typing import Any
+
+import ct_dns_utils
+import ct_scan
+
+

What this block is doing

Data structures and lookup logic for effective CAA policy analysis.

+

Flow arrows

Earlier blocks or operator input feed this block. → Module setup → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## CaaObservation + + + + + + +
+
@dataclass
+class CaaObservation:
+    name: str
+    effective_rr_owner: str | None
+    source_kind: str
+    source_label: str | None
+    aliases_seen: list[str]
+    caa_rows: list[tuple[int, str, str]]
+
+

What this block is doing

One resolved CAA result before it is merged with certificate coverage data.

+

Flow arrows

Earlier blocks or operator input feed this block. → CaaObservation → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## CaaNameRow + + + + + + +
+
@dataclass
+class CaaNameRow:
+    name: str
+    zone: str
+    source_kind: str
+    effective_rr_owner: str | None
+    source_label: str | None
+    aliases_seen: list[str]
+    issue_values: list[str]
+    issuewild_values: list[str]
+    iodef_values: list[str]
+    allowed_ca_families: list[str]
+    current_covering_families: list[str]
+    current_covering_subject_cns: list[str]
+    current_covering_cert_count: int
+    current_multi_family_overlap: bool
+    current_policy_mismatch: bool
+    mismatch_families: list[str]
+
+

What this block is doing

One final row that compares DNS policy with current live certificate families.

+

Flow arrows

Earlier blocks or operator input feed this block. → CaaNameRow → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## CaaAnalysis + + + + + + +
+
@dataclass
+class CaaAnalysis:
+    generated_at_utc: str
+    configured_domains: list[str]
+    total_names: int
+    rows: list[CaaNameRow]
+    source_kind_counts: Counter[str]
+    zone_counts: Counter[str]
+    multi_family_overlap_names: list[str]
+    policy_mismatch_names: list[str]
+
+

What this block is doing

The full CAA analysis bundle used by the monograph.

+

Flow arrows

Earlier blocks or operator input feed this block. → CaaAnalysis → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## normalize_dns_name + + + + + + +
+
def normalize_dns_name(value: str) -> str:
+    value = value.strip()
+    if value.upper().startswith("DNS:"):
+        return ct_dns_utils.normalize_name(value[4:])
+    return ct_dns_utils.normalize_name(value)
+
+

What this block is doing

This block makes values consistent so matching and grouping do not get confused by superficial differences.

+

Flow arrows

Earlier blocks or operator input feed this block. → normalize_dns_name → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## issuer_family + + + + + + +
+
def issuer_family(names: set[str]) -> str:
+    lowered = " ".join(sorted(names)).lower()
+    if "amazon" in lowered:
+        return "Amazon"
+    if "google trust services" in lowered or "cn=we1" in lowered:
+        return "Google Trust Services"
+    if "sectigo" in lowered or "comodo" in lowered:
+        return "Sectigo/COMODO"
+    if any(token in lowered for token in ["digicert", "quovadis", "thawte", "geotrust", "rapidssl", "symantec", "verisign"]):
+        return "DigiCert/QuoVadis"
+    return "Other"
+
+

What this block is doing

This function is one of the building blocks inside `ct_caa_analysis.py`. It exists so the file can do one narrow job at a time instead of one giant unreadable routine.

+

Flow arrows

Earlier blocks or operator input feed this block. → issuer_family → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## classify_zone + + + + + + +
+
def classify_zone(name: str, configured_domains: list[str]) -> str:
+    for domain in sorted(configured_domains, key=len, reverse=True):
+        lowered_domain = domain.lower()
+        if name == lowered_domain or name.endswith(f".{lowered_domain}"):
+            return lowered_domain
+    return "other"
+
+

What this block is doing

This block applies rules and chooses a category label.

+

Flow arrows

Earlier blocks or operator input feed this block. → classify_zone → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## cache_path + + + + + + +
+
def cache_path(cache_dir: Path, name: str) -> Path:
+    return cache_dir / ct_dns_utils.cache_key(f"caa-{name}")
+
+

What this block is doing

This function is one of the building blocks inside `ct_caa_analysis.py`. It exists so the file can do one narrow job at a time instead of one giant unreadable routine.

+

Flow arrows

Earlier blocks or operator input feed this block. → cache_path → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## serialize_observation + + + + + + +
+
def serialize_observation(observation: CaaObservation) -> dict[str, Any]:
+    return {
+        "name": observation.name,
+        "effective_rr_owner": observation.effective_rr_owner,
+        "source_kind": observation.source_kind,
+        "source_label": observation.source_label,
+        "aliases_seen": observation.aliases_seen,
+        "caa_rows": [list(row) for row in observation.caa_rows],
+    }
+
+

What this block is doing

This function is one of the building blocks inside `ct_caa_analysis.py`. It exists so the file can do one narrow job at a time instead of one giant unreadable routine.

+

Flow arrows

Earlier blocks or operator input feed this block. → serialize_observation → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## deserialize_observation + + + + + + +
+
def deserialize_observation(payload: dict[str, Any]) -> CaaObservation:
+    return CaaObservation(
+        name=payload["name"],
+        effective_rr_owner=payload.get("effective_rr_owner"),
+        source_kind=payload["source_kind"],
+        source_label=payload.get("source_label"),
+        aliases_seen=list(payload.get("aliases_seen", [])),
+        caa_rows=[(int(flag), str(tag), str(value)) for flag, tag, value in payload.get("caa_rows", [])],
+    )
+
+

What this block is doing

This function is one of the building blocks inside `ct_caa_analysis.py`. It exists so the file can do one narrow job at a time instead of one giant unreadable routine.

+

Flow arrows

Earlier blocks or operator input feed this block. → deserialize_observation → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## parse_caa_response + + + + + + +
+
def parse_caa_response(lines: list[str]) -> tuple[list[tuple[int, str, str]], list[str]]:
+    rows: list[tuple[int, str, str]] = []
+    aliases: list[str] = []
+    for line in lines:
+        parts = line.split(maxsplit=2)
+        if len(parts) == 3 and parts[0].isdigit():
+            flag, tag, value = parts
+            rows.append((int(flag), tag.lower(), value.strip().strip('"').lower()))
+        elif line.endswith("."):
+            aliases.append(ct_dns_utils.normalize_name(line))
+    return rows, aliases
+
+

What this block is doing

This function is one of the building blocks inside `ct_caa_analysis.py`. It exists so the file can do one narrow job at a time instead of one giant unreadable routine.

+

Flow arrows

Earlier blocks or operator input feed this block. → parse_caa_response → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## query_caa_lines + + + + + + +
+
def query_caa_lines(name: str) -> list[str]:
+    output = ct_dns_utils.run_dig(name, "CAA", short=True)
+    return [line.strip() for line in output.splitlines() if line.strip()]
+
+

What this block is doing

This block asks an external source for data and returns it in a shape the rest of the file can use.

+

Flow arrows

Earlier blocks or operator input feed this block. → query_caa_lines → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## relevant_caa_live + + + + + + +
+
def relevant_caa_live(name: str) -> CaaObservation:
+    labels = name.rstrip(".").lower().split(".")
+    for index in range(len(labels)):
+        candidate = ".".join(labels[index:])
+        rows, aliases = parse_caa_response(query_caa_lines(candidate))
+        if rows:
+            if index == 0:
+                source_kind = "alias_target" if aliases else "exact"
+            else:
+                source_kind = "parent_alias_target" if aliases else "parent"
+            return CaaObservation(
+                name=name,
+                effective_rr_owner=candidate,
+                source_kind=source_kind,
+                source_label=aliases[-1] if aliases else candidate,
+                aliases_seen=aliases,
+                caa_rows=rows,
+            )
+    return CaaObservation(
+        name=name,
+        effective_rr_owner=None,
+        source_kind="none",
+        source_label=None,
+        aliases_seen=[],
+        caa_rows=[],
+    )
+
+

What this block is doing

Finds the effective live CAA for one name, including inheritance and alias behavior.

+

Flow arrows

One DNS name from the SAN universe. → relevant_caa_live → `build_analysis` uses this to learn the effective issuance policy per name.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## scan_name_cached + + + + + + +
+
def scan_name_cached(name: str, cache_dir: Path, ttl_seconds: int) -> CaaObservation:
+    key = cache_path(cache_dir, name).name
+    cached = ct_dns_utils.load_json_cache(cache_dir, key, ttl_seconds)
+    if cached is not None:
+        cached.pop("cached_at", None)
+        return deserialize_observation(cached)
+    observation = relevant_caa_live(name)
+    ct_dns_utils.store_json_cache(cache_dir, key, serialize_observation(observation))
+    return observation
+
+

What this block is doing

This function is one of the building blocks inside `ct_caa_analysis.py`. It exists so the file can do one narrow job at a time instead of one giant unreadable routine.

+

Flow arrows

Earlier blocks or operator input feed this block. → scan_name_cached → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## allowed_ca_families + + + + + + +
+
def allowed_ca_families(caa_rows: list[tuple[int, str, str]]) -> list[str]:
+    families: set[str] = set()
+    for _flag, tag, value in caa_rows:
+        if tag != "issue":
+            continue
+        normalized = value[:-1] if value.endswith(".") else value
+        if any(token in normalized for token in ["amazon.com", "amazontrust.com", "awstrust.com", "amazonaws.com", "aws.amazon.com"]):
+            families.add("Amazon")
+        if any(token in normalized for token in ["sectigo.com", "comodoca.com", "comodo.com"]):
+            families.add("Sectigo/COMODO")
+        if any(token in normalized for token in ["digicert.com", "digicert.ne.jp", "thawte.com", "geotrust.com", "rapidssl.com", "symantec.com", "quovadisglobal.com", "digitalcertvalidation.com"]):
+            families.add("DigiCert/QuoVadis")
+        if "pki.goog" in normalized:
+            families.add("Google Trust Services")
+        if "letsencrypt.org" in normalized:
+            families.add("Let's Encrypt")
+        if any(token in normalized for token in ["telia.com", "telia.fi", "telia.se"]):
+            families.add("Telia")
+    return sorted(families)
+
+

What this block is doing

Maps the issuer domains found in CAA `issue` records onto the named CA families the report uses elsewhere (Amazon, Sectigo/COMODO, DigiCert/QuoVadis, Google Trust Services, Let's Encrypt, Telia); issuers that match no rule add no family.

+

Flow arrows

Raw CAA rows for one effective policy. → allowed_ca_families → `build_analysis` uses the normalized families for policy-vs-live comparison.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## issue_values + + + + + + +
+
def issue_values(caa_rows: list[tuple[int, str, str]], tag: str) -> list[str]:
+    return sorted({value for _flag, row_tag, value in caa_rows if row_tag == tag})
+
+

What this block is doing

Collects the distinct values carried by one CAA tag (`issue`, `issuewild`, or `iodef`) and returns them as a sorted, de-duplicated list.

+

Flow arrows

Raw CAA rows plus a tag name. → issue_values → `build_analysis` uses it to fill the issue, issuewild, and iodef columns of each name row.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## build_analysis + + + + + + +
+
def build_analysis(
+    hits: list[ct_scan.CertificateHit],
+    configured_domains: list[str],
+    cache_dir: Path,
+    ttl_seconds: int,
+) -> CaaAnalysis:
+    names = sorted(
+        {
+            normalize_dns_name(entry)
+            for hit in hits
+            for entry in hit.san_entries
+            if normalize_dns_name(entry)
+        }
+    )
+    coverage: dict[str, list[tuple[str, str]]] = defaultdict(list)
+    for hit in hits:
+        family = issuer_family(hit.issuer_names)
+        subject_cn = normalize_dns_name(hit.subject_cn)
+        for entry in hit.san_entries:
+            coverage[normalize_dns_name(entry)].append((subject_cn, family))
+
+    rows: list[CaaNameRow] = []
+    for name in names:
+        observation = scan_name_cached(name, cache_dir, ttl_seconds)
+        allowed_families = allowed_ca_families(observation.caa_rows)
+        current_families = sorted({family for _subject, family in coverage[name]})
+        mismatch_families = sorted(family for family in current_families if allowed_families and family not in allowed_families)
+        rows.append(
+            CaaNameRow(
+                name=name,
+                zone=classify_zone(name, configured_domains),
+                source_kind=observation.source_kind,
+                effective_rr_owner=observation.effective_rr_owner,
+                source_label=observation.source_label,
+                aliases_seen=observation.aliases_seen,
+                issue_values=issue_values(observation.caa_rows, "issue"),
+                issuewild_values=issue_values(observation.caa_rows, "issuewild"),
+                iodef_values=issue_values(observation.caa_rows, "iodef"),
+                allowed_ca_families=allowed_families,
+                current_covering_families=current_families,
+                current_covering_subject_cns=sorted({subject for subject, _family in coverage[name]}),
+                current_covering_cert_count=len(coverage[name]),
+                current_multi_family_overlap=len(current_families) > 1,
+                current_policy_mismatch=bool(mismatch_families),
+                mismatch_families=mismatch_families,
+            )
+        )
+
+    return CaaAnalysis(
+        generated_at_utc=ct_scan.utc_iso(datetime.now(UTC)),
+        configured_domains=sorted(configured_domains),
+        total_names=len(rows),
+        rows=rows,
+        source_kind_counts=Counter(row.source_kind for row in rows),
+        zone_counts=Counter(row.zone for row in rows),
+        multi_family_overlap_names=sorted(row.name for row in rows if row.current_multi_family_overlap),
+        policy_mismatch_names=sorted(row.name for row in rows if row.current_policy_mismatch),
+    )
+
+

What this block is doing

Runs CAA across the whole SAN namespace and compares policy with live issuance.

+

Flow arrows

Current certificate hits and the configured zones. → build_analysis → The monograph uses this for the CAA chapter and appendix.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## rows_for_zone + + + + + + +
+
def rows_for_zone(analysis: CaaAnalysis, zone: str) -> list[CaaNameRow]:
+    return [row for row in analysis.rows if row.zone == zone]
+
+

What this block is doing

Filters the full analysis down to one configured DNS zone.

+

Flow arrows

The full CAA analysis bundle. → rows_for_zone → The monograph uses zone-filtered rows for per-zone policy tables.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## policy_counter + + + + + + +
+
def policy_counter(rows: list[CaaNameRow]) -> Counter[tuple[str, ...]]:
+    counter: Counter[tuple[str, ...]] = Counter()
+    for row in rows:
+        key = tuple(row.allowed_ca_families) if row.allowed_ca_families else ("UNRESTRICTED",)
+        counter[key] += 1
+    return counter
+
+

What this block is doing

Counts how many names share each distinct allowed-CA-family combination, using the tuple of families as the key and a special `UNRESTRICTED` bucket for names with no CAA-derived restriction.

+

Flow arrows

Earlier blocks or operator input feed this block. → policy_counter → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## serialize_analysis + + + + + + +
+
def serialize_analysis(analysis: CaaAnalysis) -> dict[str, Any]:
+    return {
+        "generated_at_utc": analysis.generated_at_utc,
+        "configured_domains": analysis.configured_domains,
+        "total_names": analysis.total_names,
+        "rows": [asdict(row) for row in analysis.rows],
+        "source_kind_counts": dict(analysis.source_kind_counts),
+        "zone_counts": dict(analysis.zone_counts),
+        "multi_family_overlap_names": analysis.multi_family_overlap_names,
+        "policy_mismatch_names": analysis.policy_mismatch_names,
+    }
+
+

What this block is doing

Converts the `CaaAnalysis` bundle — including its dataclass rows and Counter fields — into plain dictionaries and lists so it can be written out as JSON.

+

Flow arrows

Earlier blocks or operator input feed this block. → serialize_analysis → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ diff --git a/teachingNoobs/ct_dns_utils.md b/teachingNoobs/ct_dns_utils.md new file mode 100644 index 0000000..4682807 --- /dev/null +++ b/teachingNoobs/ct_dns_utils.md @@ -0,0 +1,501 @@ +# ct_dns_utils.py + +Source file: [`ct_dns_utils.py`](../ct_dns_utils.py) + +Public DNS scanner. This file runs dig, follows alias chains, finds public addresses, and collapses raw DNS evidence into readable delivery labels. + +Main flow in one line: `DNS name -> dig answers -> normalized observation -> provider hints -> delivery label` + +How to read this page: + +- left side: the actual source code block +- right side: a plain-English explanation for a beginner +- read from top to bottom because later blocks depend on earlier ones + +## Module setup + + + + + + +
+
#!/usr/bin/env python3
+
+from __future__ import annotations
+
+import hashlib
+import ipaddress
+import json
+import re
+import subprocess
+import time
+from dataclasses import asdict, dataclass
+from datetime import UTC, datetime
+from pathlib import Path
+from typing import Any
+
+import ct_scan
+
+

What this block is doing

Shared DNS scanning helpers, cache helpers, and the logic that turns raw DNS answers into platform clues.

+

Flow arrows

Nothing yet; this is the starting point. → Module setup → The later DNS helpers all reuse these imports and small shared helpers.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## DnsObservation + + + + + + +
+
@dataclass
+class DnsObservation:
+    original_name: str
+    original_status: str
+    cname_chain: list[str]
+    terminal_name: str
+    terminal_status: str
+    a_records: list[str]
+    aaaa_records: list[str]
+    ptr_records: list[str]
+    classification: str
+    stack_signature: str
+    provider_hints: list[str]
+
+

What this block is doing

One complete DNS observation for one hostname.

+

Flow arrows

Earlier blocks or operator input feed this block. → DnsObservation → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## normalize_name + + + + + + +
+
def normalize_name(name: str) -> str:
+    return name.rstrip(".").lower()
+
+

What this block is doing

This block makes values consistent so matching and grouping do not get confused by superficial differences.

+

Flow arrows

Earlier blocks or operator input feed this block. → normalize_name → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## cache_key + + + + + + +
+
def cache_key(value: str) -> str:
+    digest = hashlib.sha256(value.encode("utf-8")).hexdigest()[:16]
+    slug = re.sub(r"[^a-z0-9.-]+", "-", value.lower()).strip("-")
+    slug = slug[:80] or "item"
+    return f"v1-{slug}-{digest}.json"
+
+

What this block is doing

Builds a stable, filesystem-safe cache filename from any string: a readable lowercase slug (capped at 80 characters) plus a short SHA-256 digest so similar values cannot collide.

+

Flow arrows

Any cache-identifying string. → cache_key → `scan_name_cached` and `ptr_lookup` use it to derive the on-disk JSON cache filenames.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## load_json_cache + + + + + + +
+
def load_json_cache(cache_dir: Path, key: str, ttl_seconds: int) -> dict[str, Any] | None:
+    path = cache_dir / key
+    if not path.exists():
+        return None
+    payload = json.loads(path.read_text(encoding="utf-8"))
+    cached_at = datetime.fromisoformat(payload["cached_at"].replace("Z", "+00:00"))
+    age = time.time() - cached_at.astimezone(UTC).timestamp()
+    if age > ttl_seconds:
+        return None
+    return payload
+
+

What this block is doing

Reads a cached JSON payload from disk and returns it only while its `cached_at` timestamp is within the TTL; a missing or stale file yields None so the caller re-runs the live lookup.

+

Flow arrows

Earlier blocks or operator input feed this block. → load_json_cache → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## store_json_cache + + + + + + +
+
def store_json_cache(cache_dir: Path, key: str, payload: dict[str, Any]) -> None:
+    cache_dir.mkdir(parents=True, exist_ok=True)
+    enriched = dict(payload)
+    enriched["cached_at"] = ct_scan.utc_iso(datetime.now(UTC))
+    (cache_dir / key).write_text(json.dumps(enriched, indent=2, sort_keys=True), encoding="utf-8")
+
+

What this block is doing

Writes the payload as pretty-printed JSON with a fresh `cached_at` timestamp added, creating the cache directory on first use.

+

Flow arrows

Earlier blocks or operator input feed this block. → store_json_cache → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## run_dig + + + + + + +
+
def run_dig(name: str, rrtype: str, short: bool) -> str:
+    cmd = ["dig", "+time=2", "+tries=1"]
+    if short:
+        cmd.append("+short")
+    else:
+        cmd.extend(["+noall", "+comments", "+answer"])
+    cmd.extend([name, rrtype])
+    result = subprocess.run(cmd, capture_output=True, text=True, check=False)
+    return result.stdout
+
+

What this block is doing

Invokes the `dig` command-line tool with a 2-second timeout and a single try, returning its raw stdout — either the terse `+short` form or the full output with comments and the answer section.

+

Flow arrows

A hostname and record type. → run_dig → `scan_name_live`, `dig_status`, and `dig_short` rely on this; `ptr_lookup` shells out to `dig -x` directly with its own reverse-lookup flags.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## dig_status + + + + + + +
+
def dig_status(name: str, rrtype: str = "A") -> str:
+    output = run_dig(name, rrtype, short=False)
+    match = re.search(r"status:\s*([A-Z]+)", output)
+    if match:
+        return match.group(1)
+    if output.strip():
+        return "NOERROR"
+    return "UNKNOWN"
+
+

What this block is doing

Extracts the DNS response status (for example `NOERROR` or `NXDOMAIN`) from full `dig` output, falling back to `NOERROR` when there is output without a status line and `UNKNOWN` when there is no output at all.

+

Flow arrows

A hostname and an optional record type. → dig_status → `scan_name_live` records this status, and `classify_observation` later uses it to separate NXDOMAIN from no-data cases.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## dig_short + + + + + + +
+
def dig_short(name: str, rrtype: str) -> list[str]:
+    output = run_dig(name, rrtype, short=True)
+    return [normalize_name(line) for line in output.splitlines() if line.strip()]
+
+

What this block is doing

Runs `dig` in `+short` mode and returns the normalized, non-empty answer lines as a list.

+

Flow arrows

Earlier blocks or operator input feed this block. → dig_short → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## parse_answer_section + + + + + + +
+
def parse_answer_section(output: str) -> list[tuple[str, str]]:
+    in_answer = False
+    parsed: list[tuple[str, str]] = []
+    for raw_line in output.splitlines():
+        line = raw_line.strip()
+        if not line:
+            continue
+        if line.startswith(";; ANSWER SECTION:"):
+            in_answer = True
+            continue
+        if not in_answer or line.startswith(";;"):
+            continue
+        match = re.match(r"^\S+\s+\d+\s+IN\s+(\S+)\s+(.+)$", line)
+        if not match:
+            continue
+        rrtype, rdata = match.groups()
+        parsed.append((rrtype.upper(), normalize_name(rdata)))
+    return parsed
+
+

What this block is doing

Walks full `dig` output line by line, switches on once it reaches the `ANSWER SECTION`, and extracts a (record type, normalized record data) pair from each answer line it can parse.

+

Flow arrows

Full `dig` output text. → parse_answer_section → `scan_name_live` uses the parsed pairs to build the CNAME chain and the A/AAAA address lists.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## is_ip_address + + + + + + +
+
def is_ip_address(value: str) -> bool:
+    try:
+        ipaddress.ip_address(value)
+        return True
+    except ValueError:
+        return False
+
+

What this block is doing

Returns True when the value parses as a valid IPv4 or IPv6 address and False otherwise, without raising.

+

Flow arrows

A candidate record-data string. → is_ip_address → `scan_name_live` uses it to keep only genuine IP addresses from the parsed A and AAAA answers.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## classify_observation + + + + + + +
+
def classify_observation(chain: list[str], terminal_status: str, a_records: list[str], aaaa_records: list[str]) -> str:
+    has_addresses = bool(a_records or aaaa_records)
+    if chain and has_addresses:
+        return "cname_to_address"
+    if chain and not has_addresses:
+        return "dangling_cname"
+    if has_addresses:
+        return "direct_address"
+    if terminal_status == "NXDOMAIN":
+        return "nxdomain"
+    if terminal_status == "NOERROR":
+        return "no_data"
+    return "other"
+
+

What this block is doing

This block applies rules and chooses a category label.

+

Flow arrows

Earlier blocks or operator input feed this block. → classify_observation → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## infer_provider_hints + + + + + + +
+
def infer_provider_hints(observation: DnsObservation) -> list[str]:
+    text = " ".join(
+        [
+            observation.original_name,
+            *observation.cname_chain,
+            observation.terminal_name,
+            *observation.ptr_records,
+        ]
+    ).lower()
+    hints: list[str] = []
+    if "campaign.adobe.com" in text:
+        hints.append("Adobe Campaign")
+    if "cloudfront.net" in text:
+        hints.append("AWS CloudFront")
+    if "elb.amazonaws.com" in text or "compute.amazonaws.com" in text:
+        hints.append("AWS")
+    if "apigee.net" in text or "googleusercontent.com" in text:
+        hints.append("Google Apigee")
+    if "pegacloud.net" in text or ".pega.net" in text:
+        hints.append("Pega Cloud")
+    if "useinfinite.io" in text:
+        hints.append("Infinite / agency alias")
+    if any(ip.startswith("13.107.") for ip in observation.a_records) or any(ip.startswith("2620:1ec:") for ip in observation.aaaa_records):
+        hints.append("Microsoft Edge")
+    if not hints:
+        hints.append("Unclassified")
+    return hints
+
+

What this block is doing

Reads the raw DNS trail and pulls out likely platform or vendor clues.

+

Flow arrows

One normalized DNS observation. → infer_provider_hints → `infer_stack_signature` and the report layers use the hints it produces.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## infer_stack_signature + + + + + + +
+
def infer_stack_signature(observation: DnsObservation) -> str:
+    hints = infer_provider_hints(observation)
+    if observation.classification == "nxdomain":
+        return "No public DNS (NXDOMAIN)"
+    if observation.classification == "no_data":
+        return "No public address data"
+    if "Adobe Campaign" in hints and "AWS CloudFront" in hints:
+        return "Adobe Campaign -> AWS CloudFront"
+    if "Adobe Campaign" in hints and "AWS" in hints:
+        return "Adobe Campaign -> AWS ALB"
+    if "Adobe Campaign" in hints and observation.a_records:
+        return "Adobe Campaign direct IP"
+    if "AWS CloudFront" in hints:
+        return "AWS CloudFront"
+    if "Google Apigee" in hints:
+        return "Google Apigee"
+    if "Pega Cloud" in hints and "AWS" in hints:
+        return "Pega Cloud -> AWS ALB"
+    if "Infinite / agency alias" in hints and observation.classification == "dangling_cname":
+        return "Dangling agency alias"
+    if "Microsoft Edge" in hints:
+        return "Direct Microsoft edge"
+    if "AWS" in hints:
+        return "Direct AWS"
+    if observation.classification == "direct_address":
+        return "Direct address (provider unclear)"
+    if observation.classification == "cname_to_address":
+        return "CNAME to address (provider unclear)"
+    return hints[0]
+
+

What this block is doing

Collapses several low-level DNS clues into one human-readable delivery label.

+

Flow arrows

One DNS observation plus provider clues. → infer_stack_signature → `ct_master_report` uses the resulting label in naming and DNS chapters.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## scan_name_live + + + + + + +
+
def scan_name_live(name: str) -> DnsObservation:
+    name = normalize_name(name)
+    a_output = run_dig(name, "A", short=False)
+    aaaa_output = run_dig(name, "AAAA", short=False)
+    original_status = dig_status(name, "A")
+    a_answers = parse_answer_section(a_output)
+    aaaa_answers = parse_answer_section(aaaa_output)
+    chain: list[str] = []
+    for rrtype, rdata in a_answers + aaaa_answers:
+        if rrtype == "CNAME" and rdata not in chain:
+            chain.append(rdata)
+    a_records = sorted({rdata for rrtype, rdata in a_answers if rrtype == "A" and is_ip_address(rdata)})
+    aaaa_records = sorted({rdata for rrtype, rdata in aaaa_answers if rrtype == "AAAA" and is_ip_address(rdata)})
+    terminal_name = chain[-1] if chain else name
+    terminal_status = original_status
+    observation = DnsObservation(
+        original_name=name,
+        original_status=original_status,
+        cname_chain=chain,
+        terminal_name=terminal_name,
+        terminal_status=terminal_status,
+        a_records=a_records,
+        aaaa_records=aaaa_records,
+        ptr_records=[],
+        classification=classify_observation(chain, terminal_status, a_records, aaaa_records),
+        stack_signature="",
+        provider_hints=[],
+    )
+    observation.provider_hints = infer_provider_hints(observation)
+    observation.stack_signature = infer_stack_signature(observation)
+    return observation
+
+

What this block is doing

Runs the live DNS walk for one hostname.

+

Flow arrows

One DNS name from a SAN entry. → scan_name_live → `scan_name_cached` returns this result shape to higher-level analytics.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## scan_name_cached + + + + + + +
+
def scan_name_cached(name: str, cache_dir: Path, ttl_seconds: int) -> DnsObservation:
+    key = cache_key(name)
+    cached = load_json_cache(cache_dir, key, ttl_seconds)
+    if cached is not None:
+        payload = dict(cached)
+        payload.pop("cached_at", None)
+        return DnsObservation(**payload)
+    observation = scan_name_live(name)
+    store_json_cache(cache_dir, key, asdict(observation))
+    return observation
+
+

What this block is doing

Reuses a recent DNS result if possible, otherwise performs the live scan.

+

Flow arrows

A DNS name plus cache settings. → scan_name_cached → `ct_master_report.enrich_dns` uses this for every SAN name in the current corpus.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## ptr_lookup + + + + + + +
+
def ptr_lookup(ip: str, cache_dir: Path, ttl_seconds: int) -> list[str]:
+    key = cache_key(f"ptr-{ip}")
+    cached = load_json_cache(cache_dir, key, ttl_seconds)
+    if cached is not None:
+        return list(cached.get("answers", []))
+    output = subprocess.run(
+        ["dig", "+time=2", "+tries=1", "+short", "-x", ip, "PTR"],
+        capture_output=True,
+        text=True,
+        check=False,
+    ).stdout
+    answers = [normalize_name(line) for line in output.splitlines() if line.strip()]
+    store_json_cache(cache_dir, key, {"answers": answers})
+    return answers
+
+

What this block is doing

Performs a cached reverse-DNS (PTR) lookup for one IP address: it reuses a recent answer when available, otherwise shells out to `dig -x` and stores the normalized answers in the JSON cache.

+

Flow arrows

Earlier blocks or operator input feed this block. → ptr_lookup → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## provider_explanations + + + + + + +
+
def provider_explanations() -> dict[str, str]:
+    return {
+        "Adobe Campaign": "A marketing and communication platform often used to send customer messages, email journeys, and campaign traffic. In DNS terms, it can sit in front of cloud infrastructure rather than hosting the final application by itself.",
+        "AWS": "Amazon Web Services, a large public cloud platform. In this report it usually means the endpoint ultimately lands on Amazon-hosted compute or load-balancing infrastructure.",
+        "AWS ALB": "AWS Application Load Balancer. A traffic-distribution front door that sends incoming web requests to one or more backend services.",
+        "AWS CloudFront": "Amazon's global content-delivery and edge network. It is often used to front websites, APIs, and static assets close to users.",
+        "Google Apigee": "An API gateway and API-management layer. If a hostname lands here, it usually means the public endpoint is being governed as an API product rather than being exposed directly from an application server.",
+        "Pega Cloud": "A managed hosting platform for Pega applications and workflow systems. It often fronts case-management or process-heavy applications.",
+        "Microsoft Edge": "Microsoft-operated edge infrastructure. In DNS this usually means the public name lands on Microsoft's front-door network rather than directly on a private application host.",
+        "Infinite / agency alias": "A third-party aliasing pattern typically used by an agency or service intermediary. It points traffic onward to the actual delivery platform.",
+        "CNAME": "A DNS alias record. It says one hostname is really another hostname, rather than directly mapping to an IP address.",
+        "A record": "A DNS record that maps a hostname to an IPv4 address.",
+        "AAAA record": "A DNS record that maps a hostname to an IPv6 address.",
+        "PTR record": "A reverse-DNS record. It maps an IP address back to a hostname and is useful as a provider clue, not as proof of ownership.",
+        "NXDOMAIN": "A DNS response meaning the name does not exist publicly.",
+    }
+
+

What this block is doing

Supplies the glossary text used later in the reports.

+

Flow arrows

The delivery labels used by the report. → provider_explanations → The monograph glossary uses these explanations directly.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ diff --git a/teachingNoobs/ct_focus_subjects.md b/teachingNoobs/ct_focus_subjects.md new file mode 100644 index 0000000..7c89fff --- /dev/null +++ b/teachingNoobs/ct_focus_subjects.md @@ -0,0 +1,960 @@ +# ct_focus_subjects.py + +Source file: [`ct_focus_subjects.py`](../ct_focus_subjects.py) + +Focused-cohort analyzer. This file takes your special hand-picked Subject CN list and compares it against the wider certificate and DNS estate. + +Main flow in one line: `focus-subject file -> cohort entries -> compare against current and historical estate -> bucketed cohort explanation` + +How to read this page: + +- left side: the actual source code block +- right side: a plain-English explanation for a beginner +- read from top to bottom because later blocks depend on earlier ones + +## Module setup + + + + + + +
+
#!/usr/bin/env python3
+
+from __future__ import annotations
+
+import re
+from collections import Counter
+from dataclasses import dataclass
+from pathlib import Path
+from statistics import median
+
+import ct_dns_utils
+import ct_lineage_report
+import ct_master_report
+import ct_scan
+
+
+ENVIRONMENT_HINTS = {
+    "alpha",
+    "beta",
+    "dev",
+    "qa",
+    "uat",
+    "sit",
+    "stage",
+    "stg",
+    "preprod",
+    "prod",
+    "release",
+    "squads",
+    "sandbox",
+}
+
+VENDOR_HINTS = {
+    "vendor",
+    "external",
+    "hoster",
+    "product",
+    "mitek",
+    "scrive",
+    "pega",
+}
+
+IDENTITY_HINTS = {
+    "id",
+    "idp",
+    "identity",
+    "auth",
+    "sso",
+    "online",
+    "mail",
+    "email",
+    "secmail",
+    "chat",
+    "appointment",
+    "appointments",
+}
+
+CUSTOMER_HINTS = {
+    "brand",
+    "branding",
+    "campaign",
+    "experience",
+    "welcome",
+    "thankyou",
+    "gifts",
+    "investment",
+    "client",
+    "customers",
+    "information",
+    "club",
+    "risk",
+}
+
+

What this block is doing

Rules and data shapes for analyzing the special hand-picked Subject-CN cohort.

+

Flow arrows

Earlier blocks or operator input feed this block. → Module setup → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## FocusSubject + + + + + + +
+
@dataclass
+class FocusSubject:
+    subject_cn: str
+    analyst_note: str
+
+

What this block is doing

One line from the local focus-subject file.

+

Flow arrows

Earlier blocks or operator input feed this block. → FocusSubject → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## FocusSubjectDetail + + + + + + +
+
@dataclass
class FocusSubjectDetail:
    """One detailed analytical row for one focused Subject CN.

    Counts are certificate counts from the CT corpus; "current" refers to
    the live scan, "historical" to the lineage assessment.  Several fields
    hold pre-formatted display strings ("-" when there is nothing to show).
    """

    subject_cn: str       # normalized (lowercased) Subject CN
    analyst_note: str     # note carried over from the local focus file
    analyst_theme: str    # output of analyst_theme()
    taxonomy_bucket: str  # bucket id from classify_taxonomy_bucket()
    taxonomy_reason: str  # reason string from classify_taxonomy_bucket()
    observed_role: str    # output of observed_role()
    basket_status: str    # output of basket_status()
    current_direct_certificates: int
    historical_direct_certificates: int
    # Certificates of OTHER subjects that carry this CN as a SAN entry.
    current_non_focus_san_carriers: int
    historical_non_focus_san_carriers: int
    current_revoked_certificates: int
    current_not_revoked_certificates: int
    current_dns_outcome: str
    current_dns_classification: str
    current_issuer_families: str    # summarized display text
    historical_issuer_families: str
    current_san_size_span: str      # from san_size_span()
    historical_san_size_span: str   # from historical_san_size_span()
    max_direct_to_carrier_overlap_days: int  # longest validity overlap, in days
    carrier_subjects: str           # summarized list of carrier CNs
    current_red_flags: str          # "-" when no flags were recorded
    past_red_flags: str
+
+

What this block is doing

One detailed analytical row for one focused Subject CN.

+

Flow arrows

Earlier blocks or operator input feed this block. → FocusSubjectDetail → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## FocusCohortAnalysis
+
@dataclass
class FocusCohortAnalysis:
    """The full cohort comparison bundle used in the monograph.

    "focus" fields describe the hand-picked Subject-CN cohort; "rest"
    fields describe every other subject in the same corpus, so the two
    sides can be compared like-for-like.  Fields typed ``str`` that hold
    shares/averages are pre-formatted display text, not raw numbers.
    """

    # Cohort input and the per-subject detail rows built from it.
    focus_subjects: list[FocusSubject]
    details: list[FocusSubjectDetail]
    # Coverage counts: how much of the provided cohort was actually seen.
    provided_subjects_count: int
    historically_seen_subjects_count: int
    current_direct_subjects_count: int
    current_carried_only_subjects_count: int
    historical_non_focus_carried_subjects_count: int
    unseen_subjects: list[str]
    # Certificate and revocation volumes, focus vs rest.
    current_focus_certificate_count: int
    current_rest_certificate_count: int
    focus_revoked_current_count: int
    focus_not_revoked_current_count: int
    rest_revoked_current_count: int
    rest_not_revoked_current_count: int
    focus_revoked_share: str
    rest_revoked_share: str
    # SAN-size statistics, focus vs rest.
    focus_median_san_entries: int
    focus_average_san_entries: str
    rest_median_san_entries: int
    rest_average_san_entries: str
    focus_multi_zone_certificate_count: int
    rest_multi_zone_certificate_count: int
    # DNS and issuer distributions, focus vs rest.
    focus_current_subject_dns_classes: Counter[str]
    rest_current_subject_dns_classes: Counter[str]
    focus_current_subject_dns_stacks: Counter[str]
    rest_current_subject_dns_stacks: Counter[str]
    focus_current_issuer_families: Counter[str]
    rest_current_issuer_families: Counter[str]
    # Red-flag exposure within the focused cohort.
    focus_current_red_flag_subjects: int
    focus_past_red_flag_subjects: int
    focus_any_red_flag_subjects: int
    # Taxonomy rollup and highlighted rows.
    bucket_counts: Counter[str]
    notables: list[FocusSubjectDetail]
    transition_rows: list[FocusSubjectDetail]
+
+

What this block is doing

The full cohort comparison bundle used in the monograph.

+

Flow arrows

Earlier blocks or operator input feed this block. → FocusCohortAnalysis → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## load_focus_subjects
+
def load_focus_subjects(path: Path) -> list[FocusSubject]:
    """Parse the local focus-subject file into FocusSubject entries.

    Blank lines and '#' comment lines are skipped.  An optional trailing
    "(note)" on a line becomes the analyst note.  Duplicate CNs (after
    lowercasing) keep only their first occurrence, in file order.
    """
    if not path.exists():
        return []
    parsed: dict[str, FocusSubject] = {}
    for raw_line in path.read_text(encoding="utf-8").splitlines():
        text = raw_line.strip()
        if not text or text.startswith("#"):
            continue
        hit = re.match(r"^(?P<cn>[^()]+?)(?:\s*\((?P<meta>.*)\))?$", text)
        if hit is None:
            continue
        cn = hit.group("cn").strip().lower()
        if cn in parsed:
            continue
        parsed[cn] = FocusSubject(
            subject_cn=cn,
            analyst_note=(hit.group("meta") or "").strip(),
        )
    # Insertion order of the dict preserves first-seen file order.
    return list(parsed.values())
+
+

What this block is doing

Reads the local focus-subject list and any analyst notes attached to it.

+

Flow arrows

The local focus-subject file. → load_focus_subjects → `build_analysis` uses these parsed cohort entries.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## dns_names
+
def dns_names(san_entries: list[str]) -> set[str]:
    """Return the lowercased DNS names from a SAN entry list (non-DNS skipped)."""
    names: set[str] = set()
    for entry in san_entries:
        if entry.startswith("DNS:"):
            names.add(entry[4:].lower())
    return names
+
+

What this block is doing

Extracts the lowercased DNS host names from a certificate's SAN entry list, ignoring any non-DNS entries such as IP addresses.

+

Flow arrows

Earlier blocks or operator input feed this block. → dns_names → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## overlap_days
+
def overlap_days(
    left_start,
    left_end,
    right_start,
    right_end,
) -> int:
    """Whole days of overlap between two intervals.

    Returns 0 when the intervals are disjoint (or merely touch), and at
    least 1 for any positive overlap, however small.
    """
    latest_start = max(left_start, right_start)
    earliest_end = min(left_end, right_end)
    if earliest_end > latest_start:
        return max(1, (earliest_end - latest_start).days)
    return 0
+
+

What this block is doing

Computes how many whole days two validity windows overlap, returning 0 when they are disjoint and at least 1 for any positive overlap.

+

Flow arrows

Earlier blocks or operator input feed this block. → overlap_days → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## pct
+
def pct(count: int, total: int) -> str:
    """Format count/total as a one-decimal percentage string (e.g. "25.0%")."""
    if total > 0:
        share = (count / total) * 100
        return f"{share:.1f}%"
    # Guard against division by zero (or a nonsensical negative total).
    return "0.0%"
+
+

What this block is doing

This is a small helper that keeps the larger analytical code cleaner and easier to reuse.

+

Flow arrows

Earlier blocks or operator input feed this block. → pct → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## short_issuer_family
+
def short_issuer_family(issuer_name: str) -> str:
    """Collapse a full issuer DN into one of a few coarse CA family labels."""
    needle = issuer_name.lower()
    # First matching family wins; order mirrors the original checks.
    families = (
        (("amazon",), "Amazon"),
        (("sectigo", "comodo"), "Sectigo/COMODO"),
        (("google trust services", "cn=we1"), "Google Trust Services"),
    )
    for markers, label in families:
        if any(marker in needle for marker in markers):
            return label
    return "Other"
+
+

What this block is doing

Collapses a full issuer distinguished name into one of a few coarse CA family labels: Amazon, Sectigo/COMODO, Google Trust Services, or Other.

+

Flow arrows

Earlier blocks or operator input feed this block. → short_issuer_family → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## median_int
+
def median_int(values: list[int]) -> int:
    """Median of the values truncated to int; 0 for an empty list."""
    return int(median(values)) if values else 0
+
+

What this block is doing

This function is one of the building blocks inside `ct_focus_subjects.py`. It exists so the file can do one narrow job at a time instead of one giant unreadable routine.

+

Flow arrows

Earlier blocks or operator input feed this block. → median_int → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## average_text
+
def average_text(values: list[int]) -> str:
    """Arithmetic mean rendered with one decimal; "0.0" for an empty list."""
    if values:
        return f"{sum(values) / len(values):.1f}"
    return "0.0"
+
+

What this block is doing

Formats the arithmetic mean of a list of integers with one decimal place, returning "0.0" for an empty list.

+

Flow arrows

Earlier blocks or operator input feed this block. → average_text → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## san_size_span
+
def san_size_span(current_hits: list[ct_scan.CertificateHit]) -> str:
    """Describe the distinct SAN-list sizes seen across the current hits.

    "-" when empty, the single size when uniform, otherwise up to four
    sizes with a "(+N more)" tail.
    """
    distinct = sorted({len(hit.san_entries) for hit in current_hits})
    if not distinct:
        return "-"
    if len(distinct) == 1:
        return str(distinct[0])
    shown = ", ".join(str(size) for size in distinct[:4])
    if len(distinct) > 4:
        shown += f", ... (+{len(distinct) - 4} more)"
    return shown
+
+

What this block is doing

This function is one of the building blocks inside `ct_focus_subjects.py`. It exists so the file can do one narrow job at a time instead of one giant unreadable routine.

+

Flow arrows

Earlier blocks or operator input feed this block. → san_size_span → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## historical_san_size_span
+
def historical_san_size_span(certificates: list[ct_lineage_report.HistoricalCertificate]) -> str:
    """Describe the distinct SAN-list sizes seen across historical certificates.

    Same display convention as san_size_span(): "-" when empty, one size
    when uniform, otherwise up to four sizes plus a "(+N more)" tail.
    """
    distinct = sorted({len(certificate.san_entries) for certificate in certificates})
    if not distinct:
        return "-"
    if len(distinct) == 1:
        return str(distinct[0])
    shown = ", ".join(str(size) for size in distinct[:4])
    if len(distinct) > 4:
        shown += f", ... (+{len(distinct) - 4} more)"
    return shown
+
+

What this block is doing

This function is one of the building blocks inside `ct_focus_subjects.py`. It exists so the file can do one narrow job at a time instead of one giant unreadable routine.

+

Flow arrows

Earlier blocks or operator input feed this block. → historical_san_size_span → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## summarize_names
+
def summarize_names(values: set[str], limit: int = 4) -> str:
    """Render a set of names as a short, case-insensitively sorted list.

    "-" when empty; otherwise at most ``limit`` names, with a
    "(+N more)" tail when some are omitted.
    """
    if not values:
        return "-"
    ordered = sorted(values, key=str.casefold)
    hidden = len(ordered) - limit
    if hidden <= 0:
        return ", ".join(ordered)
    return ", ".join(ordered[:limit]) + f", ... (+{hidden} more)"
+
+

What this block is doing

This block compresses many detailed rows into a smaller, easier-to-read summary.

+

Flow arrows

Earlier blocks or operator input feed this block. → summarize_names → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## zone_count_from_sans
+
def zone_count_from_sans(san_entries: list[str]) -> int:
    """Count the distinct zones referenced by the DNS-type SAN entries."""
    zones = set()
    for entry in san_entries:
        if entry.startswith("DNS:"):
            # san_tail_split returns a pair; index 1 is the zone part.
            zones.add(ct_scan.san_tail_split(entry[4:])[1])
    return len(zones)
+
+

What this block is doing

This function is one of the building blocks inside `ct_focus_subjects.py`. It exists so the file can do one narrow job at a time instead of one giant unreadable routine.

+

Flow arrows

Earlier blocks or operator input feed this block. → zone_count_from_sans → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## max_san_count_current
+
def max_san_count_current(hits: list[ct_scan.CertificateHit]) -> int:
    """Largest SAN-entry count across the current hits (0 when empty)."""
    largest = 0
    for hit in hits:
        largest = max(largest, len(hit.san_entries))
    return largest
+
+

What this block is doing

This function is one of the building blocks inside `ct_focus_subjects.py`. It exists so the file can do one narrow job at a time instead of one giant unreadable routine.

+

Flow arrows

Earlier blocks or operator input feed this block. → max_san_count_current → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## max_san_count_historical
+
def max_san_count_historical(certificates: list[ct_lineage_report.HistoricalCertificate]) -> int:
    """Largest SAN-entry count across historical certificates (0 when empty)."""
    largest = 0
    for certificate in certificates:
        largest = max(largest, len(certificate.san_entries))
    return largest
+
+

What this block is doing

This function is one of the building blocks inside `ct_focus_subjects.py`. It exists so the file can do one narrow job at a time instead of one giant unreadable routine.

+

Flow arrows

Earlier blocks or operator input feed this block. → max_san_count_historical → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## max_zone_count_current
+
def max_zone_count_current(hits: list[ct_scan.CertificateHit]) -> int:
    """Largest distinct-zone count across the current hits (0 when empty)."""
    largest = 0
    for hit in hits:
        largest = max(largest, zone_count_from_sans(hit.san_entries))
    return largest
+
+

What this block is doing

This function is one of the building blocks inside `ct_focus_subjects.py`. It exists so the file can do one narrow job at a time instead of one giant unreadable routine.

+

Flow arrows

Earlier blocks or operator input feed this block. → max_zone_count_current → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## bucket_sort_key
+
# Display order for taxonomy buckets; unknown buckets sort last (99).
# Hoisted to module level so the dict is not rebuilt on every sort call.
_BUCKET_ORDER = {
    "direct_front_door": 0,
    "platform_matrix_anchor": 1,
    "ambiguous_legacy": 2,
}


def bucket_sort_key(value: str) -> tuple[int, str]:
    """Sort key placing known taxonomy buckets first, in fixed order.

    Unknown bucket ids rank after the known ones and are ordered
    alphabetically among themselves via the second tuple element.
    """
    return (_BUCKET_ORDER.get(value, 99), value)
+
+

What this block is doing

This function is one of the building blocks inside `ct_focus_subjects.py`. It exists so the file can do one narrow job at a time instead of one giant unreadable routine.

+

Flow arrows

Earlier blocks or operator input feed this block. → bucket_sort_key → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## taxonomy_bucket_label
+
def taxonomy_bucket_label(bucket: str) -> str:
    """Human-readable label for a taxonomy bucket id (unknown ids pass through)."""
    labels = {
        "direct_front_door": "Front-door direct name",
        "platform_matrix_anchor": "Platform-anchor matrix name",
        "ambiguous_legacy": "Ambiguous or legacy residue",
    }
    return labels.get(bucket, bucket)
+
+

What this block is doing

This function is one of the building blocks inside `ct_focus_subjects.py`. It exists so the file can do one narrow job at a time instead of one giant unreadable routine.

+

Flow arrows

Earlier blocks or operator input feed this block. → taxonomy_bucket_label → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## analyst_theme
+
def analyst_theme(subject: FocusSubject) -> str:
    """Guess the broad theme of a focused CN from its name plus analyst note.

    Keyword hint sets are checked in priority order; if none match, the
    left-most DNS label decides between "opaque" and "human-named".
    """
    words = set(re.findall(r"[a-z0-9]+", f"{subject.subject_cn} {subject.analyst_note}".lower()))
    if words & ENVIRONMENT_HINTS:
        return "environment or platform anchor"
    if words & VENDOR_HINTS:
        return "vendor or product integration"
    if words & IDENTITY_HINTS:
        return "identity, messaging, or service front"
    if words & CUSTOMER_HINTS:
        return "customer proposition or campaign front"
    label = subject.subject_cn.split(".")[0].lower()
    # Pure digits, or a short letters+digits code, reads as opaque/legacy.
    if re.fullmatch(r"\d+", label) or re.fullmatch(r"[a-z]{2,6}\d{1,4}", label):
        return "opaque or legacy label"
    return "human-named branded or service endpoint"
+
+

What this block is doing

This function is one of the building blocks inside `ct_focus_subjects.py`. It exists so the file can do one narrow job at a time instead of one giant unreadable routine.

+

Flow arrows

Earlier blocks or operator input feed this block. → analyst_theme → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## classify_taxonomy_bucket
+
def classify_taxonomy_bucket(
    subject: FocusSubject,
    current_hits: list[ct_scan.CertificateHit],
    historical_hits: list[ct_lineage_report.HistoricalCertificate],
    current_carriers: list[ct_scan.CertificateHit],
    historical_carriers: list[ct_lineage_report.HistoricalCertificate],
) -> tuple[str, str]:
    """Place a focused CN into one taxonomy bucket with a one-line reason.

    Returns (bucket_id, reason) where bucket_id is "direct_front_door",
    "platform_matrix_anchor", or "ambiguous_legacy".  The rule chain below
    is priority-ordered: the first matching rule wins, so reordering the
    branches would change classifications.
    """
    # Lowercased alphanumeric tokens from the CN plus the analyst note.
    tokens = set(re.findall(r"[a-z0-9]+", f"{subject.subject_cn} {subject.analyst_note}".lower()))
    left_label = subject.subject_cn.split(".")[0].lower()
    # "Opaque" = left-most label is pure digits or a short letters+digits code.
    opaque_label = bool(
        re.fullmatch(r"\d+", left_label)
        or re.fullmatch(r"[a-z]{1,4}\d{1,4}", left_label)
    )
    current_direct_exists = bool(current_hits)
    historical_direct_exists = bool(historical_hits)
    max_current_sans = max_san_count_current(current_hits)
    max_historical_sans = max_san_count_historical(historical_hits)
    max_any_sans = max(max_current_sans, max_historical_sans)
    max_current_zones = max_zone_count_current(current_hits)
    # Carrier-only: the CN shows up only as a SAN inside other subjects'
    # certificates, never (any more) as a direct subject itself.
    carrier_only_today = not current_direct_exists and bool(current_carriers)
    carrier_only_history = (not current_direct_exists and not historical_direct_exists and bool(historical_carriers))
    environment_signal = bool(ENVIRONMENT_HINTS & tokens)

    # Rule 1: very wide SAN coverage (>= 20 names) marks a platform anchor.
    if max_any_sans >= 20:
        return (
            "platform_matrix_anchor",
            "Large SAN matrix coverage indicates an umbrella certificate for a managed platform slice rather than one standalone public front door.",
        )
    # Rule 2: names that only ride along as SAN passengers are residue.
    if carrier_only_today or carrier_only_history:
        return (
            "ambiguous_legacy",
            "This name now appears mainly as a carried SAN passenger or as historical residue, so it no longer behaves like a stable standalone certificate front.",
        )
    # Rule 3: small (<= 4 SANs), single-zone, human-readable, currently
    # direct names are the classic front-door shape.
    if current_direct_exists and max_any_sans <= 4 and max_current_zones <= 1 and not opaque_label and not environment_signal:
        return (
            "direct_front_door",
            "Small direct certificates, single-zone scope, and a human-readable service label fit the pattern of a branded or service-facing public entry point.",
        )
    # Rule 4: a former simple front door with no current direct certificate.
    if historical_direct_exists and not current_direct_exists and max_any_sans <= 4 and not opaque_label:
        return (
            "ambiguous_legacy",
            "The historical certificates look like a simple direct front, but there is no current direct certificate anymore, which makes this mostly migration residue rather than a live front-door pattern.",
        )
    # Rule 5: small but opaque-labelled names stay ambiguous.
    if max_any_sans <= 4 and opaque_label:
        return (
            "ambiguous_legacy",
            "The direct certificate shape is small and simple, but the left-most label is too opaque to treat as a clear branded or service-front naming pattern.",
        )
    # Rule 6: environment wording without platform-scale SAN coverage.
    if environment_signal and max_any_sans <= 19:
        return (
            "ambiguous_legacy",
            "Environment-style wording is present, but the SAN coverage is not broad enough to prove a full platform-matrix certificate role.",
        )
    # Rule 7: mid-sized SAN sets (5-19) with direct issuance stay mixed.
    if max_any_sans > 4:
        return (
            "ambiguous_legacy",
            "Direct issuance exists, but the SAN set is broader or more variable than a simple one-service front, which leaves the role mixed.",
        )
    # Fallback: not enough evidence to justify a stronger bucket.
    return (
        "ambiguous_legacy",
        "The evidence is mixed or too thin to place this name cleanly in one of the stronger bucket patterns.",
    )
+
+

What this block is doing

Places a name into the direct-front, platform-anchor, or ambiguous bucket.

+

Flow arrows

One focused Subject CN plus surrounding evidence. → classify_taxonomy_bucket → `build_analysis` uses the bucket label in the focused-cohort chapter.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## observed_role
+
def observed_role(
    subject: FocusSubject,
    current_hits: list[ct_scan.CertificateHit],
    current_carriers: list[ct_scan.CertificateHit],
    historical_carriers: list[ct_lineage_report.HistoricalCertificate],
    observation: ct_dns_utils.DnsObservation,
) -> str:
    """Describe, in plain English, the role this name appears to play.

    Priority-ordered rule chain: carrier-only cases first, then SAN-width,
    revocation-churn, and keyword signals, then the live DNS observation,
    with a generic fallback.  Earlier rules win.
    """
    # Lowercased alphanumeric tokens from the CN plus the analyst note.
    tokens = set(re.findall(r"[a-z0-9]+", f"{subject.subject_cn} {subject.analyst_note}".lower()))
    # No direct certificates: describe how (or whether) the name is carried.
    if not current_hits and current_carriers:
        return "carried today inside another certificate"
    if not current_hits and historical_carriers:
        return "historical carried alias or retired passenger"
    if not current_hits:
        return "not seen in the CT corpus"
    max_san_entries = max(len(hit.san_entries) for hit in current_hits)
    # Wide SAN coverage or environment wording marks a platform anchor.
    if max_san_entries >= 20 or (ENVIRONMENT_HINTS & tokens):
        return "platform matrix or environment anchor"
    # Three or more revoked current certificates suggests high churn.
    revoked = sum(1 for hit in current_hits if hit.revocation_status == "revoked")
    if revoked >= 3:
        return "high-churn direct service front"
    if VENDOR_HINTS & tokens:
        return "direct vendor or product integration front"
    if IDENTITY_HINTS & tokens:
        return "direct service or identity front"
    if CUSTOMER_HINTS & tokens:
        return "direct branded or customer proposition front"
    # Fall back on the live DNS shape when the keywords say nothing.
    if observation.classification in {"direct_address", "cname_to_address"}:
        return "direct standalone service front"
    return "standalone branded or service endpoint"
+
+

What this block is doing

Tries to describe what role the name appears to play in the public estate.

+

Flow arrows

One focused Subject CN plus public evidence. → observed_role → `build_analysis` stores the plain-English role description.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## basket_status
+
def basket_status(
    current_hits: list[ct_scan.CertificateHit],
    current_carriers: list[ct_scan.CertificateHit],
    historical_hits: list[ct_lineage_report.HistoricalCertificate],
    historical_carriers: list[ct_lineage_report.HistoricalCertificate],
) -> str:
    """Summarize whether a CN is direct, carried, or both — now or only in the past."""
    has_direct = bool(current_hits)
    has_carried = bool(current_carriers)
    # Current evidence always outranks historical evidence.
    if has_direct and has_carried:
        return "current direct-and-carried overlap"
    if has_direct:
        return "current direct subject certificate"
    if has_carried:
        return "current SAN passenger only"
    had_direct = bool(historical_hits)
    had_carried = bool(historical_carriers)
    if had_direct and had_carried:
        return "historical direct-and-carried only"
    if had_direct:
        return "historical direct only"
    if had_carried:
        return "historical SAN passenger only"
    return "not seen"
+
+

What this block is doing

Summarizes whether a focused CN appears as a direct certificate subject, as a carried SAN passenger, or both — currently or only historically.

+

Flow arrows

Earlier blocks or operator input feed this block. → basket_status → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## red_flag_text
+
def red_flag_text(row_lookup: dict[str, str], subject_cn: str) -> str:
    """Look up the red-flag text for a CN (case-insensitive), defaulting to "-"."""
    key = subject_cn.lower()
    if key in row_lookup:
        return row_lookup[key]
    return "-"
+
+

What this block is doing

This function is one of the building blocks inside `ct_focus_subjects.py`. It exists so the file can do one narrow job at a time instead of one giant unreadable routine.

+

Flow arrows

Earlier blocks or operator input feed this block. → red_flag_text → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## build_analysis
+
def build_analysis(
    subjects: list[FocusSubject],
    report: dict[str, object],
    assessment: ct_lineage_report.HistoricalAssessment,
    dns_cache_dir: Path,
    dns_cache_ttl_seconds: int,
) -> FocusCohortAnalysis | None:
    """Compare the focus-subject cohort against the rest of the estate.

    Builds one FocusSubjectDetail per focus subject (taxonomy bucket,
    direct/carrier certificate counts, revocation counts, DNS observation,
    issuer families, red flags), then aggregates cohort-versus-rest
    statistics over the current certificate population.

    Returns None when no focus subjects are configured.
    """
    if not subjects:
        return None
    # NOTE(review): focus_set keeps the CNs exactly as provided, yet every
    # membership test below compares lowercased hit CNs against it.  This
    # only works if focus subject CNs are already normalized to lowercase
    # upstream — confirm against the loader.
    focus_set = {subject.subject_cn for subject in subjects}

    current_hits = report["hits"]
    # Index current and historical certificates by lowercase Subject CN.
    current_by_cn: dict[str, list[ct_scan.CertificateHit]] = {}
    for hit in current_hits:
        current_by_cn.setdefault(hit.subject_cn.lower(), []).append(hit)

    historical_by_cn: dict[str, list[ct_lineage_report.HistoricalCertificate]] = {}
    for certificate in assessment.certificates:
        historical_by_cn.setdefault(certificate.subject_cn.lower(), []).append(certificate)

    # Certificates whose subject is outside the cohort; these are the
    # candidate "carriers" that may list a focus CN only in their SANs.
    non_focus_current = [hit for hit in current_hits if hit.subject_cn.lower() not in focus_set]
    non_focus_historical = [certificate for certificate in assessment.certificates if certificate.subject_cn.lower() not in focus_set]

    observation_by_name = report["observation_by_name"]
    detail_rows: list[FocusSubjectDetail] = []
    transition_rows: list[FocusSubjectDetail] = []

    # Red-flag text keyed by lowercase CN for O(1) lookup per subject.
    current_red_flag_lookup = {row.subject_cn.lower(): row.flags for row in assessment.current_red_flag_rows}
    past_red_flag_lookup = {row.subject_cn.lower(): row.flags for row in assessment.past_red_flag_rows}

    for subject in subjects:
        # NOTE(review): raw CN looked up against lowercase-keyed maps —
        # same lowercase-input assumption as focus_set above.
        current_direct = current_by_cn.get(subject.subject_cn, [])
        historical_direct = historical_by_cn.get(subject.subject_cn, [])
        # Non-focus certificates that carry this subject in their SAN list.
        current_carriers = [hit for hit in non_focus_current if subject.subject_cn in dns_names(hit.san_entries)]
        historical_carriers = [
            certificate
            for certificate in non_focus_historical
            if subject.subject_cn in dns_names(certificate.san_entries)
        ]
        # Reuse the report's DNS observation when available; otherwise
        # resolve now through the on-disk DNS cache.
        observation = observation_by_name.get(subject.subject_cn) or ct_dns_utils.scan_name_cached(
            subject.subject_cn,
            dns_cache_dir,
            dns_cache_ttl_seconds,
        )
        current_issuer_families = Counter(
            short_issuer_family(ct_scan.primary_issuer_name(hit))
            for hit in current_direct
        )
        historical_issuer_families = Counter(
            certificate.issuer_family
            for certificate in historical_direct
        )
        # Longest validity overlap (in days) between any direct certificate
        # and any carrier certificate — pairwise maximum.
        max_overlap = 0
        for direct_certificate in historical_direct:
            for carrier_certificate in historical_carriers:
                max_overlap = max(
                    max_overlap,
                    overlap_days(
                        direct_certificate.validity_not_before,
                        direct_certificate.effective_not_after,
                        carrier_certificate.validity_not_before,
                        carrier_certificate.effective_not_after,
                    ),
                )
        taxonomy_bucket, taxonomy_reason = classify_taxonomy_bucket(
            subject,
            current_direct,
            historical_direct,
            current_carriers,
            historical_carriers,
        )
        detail = FocusSubjectDetail(
            subject_cn=subject.subject_cn,
            analyst_note=subject.analyst_note or "-",
            analyst_theme=analyst_theme(subject),
            taxonomy_bucket=taxonomy_bucket,
            taxonomy_reason=taxonomy_reason,
            observed_role=observed_role(subject, current_direct, current_carriers, historical_carriers, observation),
            basket_status=basket_status(current_direct, current_carriers, historical_direct, historical_carriers),
            current_direct_certificates=len(current_direct),
            historical_direct_certificates=len(historical_direct),
            current_non_focus_san_carriers=len(current_carriers),
            historical_non_focus_san_carriers=len(historical_carriers),
            current_revoked_certificates=sum(1 for hit in current_direct if hit.revocation_status == "revoked"),
            current_not_revoked_certificates=sum(1 for hit in current_direct if hit.revocation_status == "not_revoked"),
            current_dns_outcome=observation.stack_signature,
            current_dns_classification=observation.classification,
            current_issuer_families=", ".join(
                f"{name} ({count})"
                for name, count in current_issuer_families.most_common()
            ) or "-",
            historical_issuer_families=", ".join(
                f"{name} ({count})"
                for name, count in historical_issuer_families.most_common()
            ) or "-",
            current_san_size_span=san_size_span(current_direct),
            historical_san_size_span=historical_san_size_span(historical_direct),
            max_direct_to_carrier_overlap_days=max_overlap,
            carrier_subjects=summarize_names({hit.subject_cn for hit in current_carriers} | {certificate.subject_cn for certificate in historical_carriers}),
            current_red_flags=red_flag_text(current_red_flag_lookup, subject.subject_cn),
            past_red_flags=red_flag_text(past_red_flag_lookup, subject.subject_cn),
        )
        detail_rows.append(detail)
        # Any subject carried in a non-focus SAN feeds the transition table.
        if detail.current_non_focus_san_carriers or detail.historical_non_focus_san_carriers:
            transition_rows.append(detail)

    # Cohort-versus-rest split over the current certificate population.
    focus_current_hits = [hit for hit in current_hits if hit.subject_cn.lower() in focus_set]
    rest_current_hits = [hit for hit in current_hits if hit.subject_cn.lower() not in focus_set]

    def zone_count(hit: ct_scan.CertificateHit) -> int:
        # Distinct registrable zones among the DNS SAN entries; entry[4:]
        # strips the "DNS:" prefix before splitting.
        return len({ct_scan.san_tail_split(entry[4:])[1] for entry in hit.san_entries if entry.startswith("DNS:")})

    focus_current_subject_names = sorted({hit.subject_cn.lower() for hit in focus_current_hits})
    rest_current_subject_names = sorted({hit.subject_cn.lower() for hit in rest_current_hits})

    def observation_for_subject(name: str) -> ct_dns_utils.DnsObservation:
        # Same reuse-or-resolve pattern as the per-subject loop above.
        return observation_by_name.get(name) or ct_dns_utils.scan_name_cached(name, dns_cache_dir, dns_cache_ttl_seconds)

    focus_current_subject_observations = [observation_for_subject(name) for name in focus_current_subject_names]
    rest_current_subject_observations = [observation_for_subject(name) for name in rest_current_subject_names]

    focus_current_issuer_families = Counter(
        short_issuer_family(ct_scan.primary_issuer_name(hit))
        for hit in focus_current_hits
    )
    rest_current_issuer_families = Counter(
        short_issuer_family(ct_scan.primary_issuer_name(hit))
        for hit in rest_current_hits
    )

    current_red_flag_subjects = {row.subject_cn.lower() for row in assessment.current_red_flag_rows}
    past_red_flag_subjects = {row.subject_cn.lower() for row in assessment.past_red_flag_rows}

    # Top-10 notable subjects: bucket order first, then descending count of
    # risk signals (revocations, carriers, red flags), then descending
    # direct-certificate volume, then name as the stable tiebreak.
    notables = sorted(
        detail_rows,
        key=lambda item: (
            bucket_sort_key(item.taxonomy_bucket),
            -(
                (item.current_revoked_certificates > 0)
                + (item.current_non_focus_san_carriers > 0)
                + (item.historical_non_focus_san_carriers > 0)
                + (item.current_red_flags != "-")
                + (item.past_red_flags != "-")
            ),
            -item.current_direct_certificates,
            item.subject_cn,
        ),
    )[:10]

    return FocusCohortAnalysis(
        focus_subjects=subjects,
        details=sorted(detail_rows, key=lambda item: (bucket_sort_key(item.taxonomy_bucket), item.subject_cn.casefold())),
        provided_subjects_count=len(subjects),
        historically_seen_subjects_count=sum(
            1
            for item in detail_rows
            if item.historical_direct_certificates > 0 or item.historical_non_focus_san_carriers > 0
        ),
        current_direct_subjects_count=sum(1 for item in detail_rows if item.current_direct_certificates > 0),
        current_carried_only_subjects_count=sum(
            1
            for item in detail_rows
            if item.current_direct_certificates == 0 and item.current_non_focus_san_carriers > 0
        ),
        historical_non_focus_carried_subjects_count=sum(
            1
            for item in detail_rows
            if item.historical_non_focus_san_carriers > 0
        ),
        unseen_subjects=[item.subject_cn for item in detail_rows if item.basket_status == "not seen"],
        current_focus_certificate_count=len(focus_current_hits),
        current_rest_certificate_count=len(rest_current_hits),
        focus_revoked_current_count=sum(1 for hit in focus_current_hits if hit.revocation_status == "revoked"),
        focus_not_revoked_current_count=sum(1 for hit in focus_current_hits if hit.revocation_status == "not_revoked"),
        rest_revoked_current_count=sum(1 for hit in rest_current_hits if hit.revocation_status == "revoked"),
        rest_not_revoked_current_count=sum(1 for hit in rest_current_hits if hit.revocation_status == "not_revoked"),
        focus_revoked_share=pct(
            sum(1 for hit in focus_current_hits if hit.revocation_status == "revoked"),
            len(focus_current_hits),
        ),
        rest_revoked_share=pct(
            sum(1 for hit in rest_current_hits if hit.revocation_status == "revoked"),
            len(rest_current_hits),
        ),
        focus_median_san_entries=median_int([len(hit.san_entries) for hit in focus_current_hits]),
        focus_average_san_entries=average_text([len(hit.san_entries) for hit in focus_current_hits]),
        rest_median_san_entries=median_int([len(hit.san_entries) for hit in rest_current_hits]),
        rest_average_san_entries=average_text([len(hit.san_entries) for hit in rest_current_hits]),
        focus_multi_zone_certificate_count=sum(1 for hit in focus_current_hits if zone_count(hit) > 1),
        rest_multi_zone_certificate_count=sum(1 for hit in rest_current_hits if zone_count(hit) > 1),
        focus_current_subject_dns_classes=Counter(observation.classification for observation in focus_current_subject_observations),
        rest_current_subject_dns_classes=Counter(observation.classification for observation in rest_current_subject_observations),
        focus_current_subject_dns_stacks=Counter(observation.stack_signature for observation in focus_current_subject_observations),
        rest_current_subject_dns_stacks=Counter(observation.stack_signature for observation in rest_current_subject_observations),
        focus_current_issuer_families=focus_current_issuer_families,
        rest_current_issuer_families=rest_current_issuer_families,
        focus_current_red_flag_subjects=sum(1 for subject in subjects if subject.subject_cn in current_red_flag_subjects),
        focus_past_red_flag_subjects=sum(1 for subject in subjects if subject.subject_cn in past_red_flag_subjects),
        focus_any_red_flag_subjects=sum(
            1
            for subject in subjects
            if subject.subject_cn in current_red_flag_subjects or subject.subject_cn in past_red_flag_subjects
        ),
        bucket_counts=Counter(item.taxonomy_bucket for item in detail_rows),
        notables=notables,
        transition_rows=sorted(
            transition_rows,
            key=lambda item: (
                -(item.current_non_focus_san_carriers + item.historical_non_focus_san_carriers),
                -item.max_direct_to_carrier_overlap_days,
                item.subject_cn.casefold(),
            ),
        ),
    )
+
+

What this block is doing

Runs the full comparison between the focused cohort and the rest of the estate.

+

Flow arrows

The focus-subject list, current-state report, and historical assessment. → build_analysis → The monograph uses the resulting bundle for Chapter 8 and Appendix D.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ diff --git a/teachingNoobs/ct_lineage_report.md b/teachingNoobs/ct_lineage_report.md new file mode 100644 index 0000000..f9c972f --- /dev/null +++ b/teachingNoobs/ct_lineage_report.md @@ -0,0 +1,2184 @@ +# ct_lineage_report.py + +Source file: [`ct_lineage_report.py`](../ct_lineage_report.py) + +Historical analyzer. This file studies expired plus current certificates to find renewals, overlap, drift, and issuance bursts over time. + +Main flow in one line: `historical CT rows -> historical certificates -> grouped by Subject CN -> overlap and drift checks -> red flags` + +How to read this page: + +- left side: the actual source code block +- right side: a plain-English explanation for a beginner +- read from top to bottom because later blocks depend on earlier ones + +## Module setup + + + + + + +
+
#!/usr/bin/env python3
"""Historical certificate-lineage analysis over Certificate Transparency data.

Pulls the full historical corpus (expired and current certificates) for
each configured search term from the public crt.sh "certwatch" database;
the rest of the module groups results by Subject CN to detect renewals,
validity overlap, Subject-DN/issuer drift, SAN drift, and issuance bursts.
"""

from __future__ import annotations

import argparse
import hashlib
from collections import Counter, defaultdict
from dataclasses import dataclass, field
from datetime import UTC, date, datetime, timedelta
from pathlib import Path
from typing import Any

from cryptography import x509
from cryptography.x509.oid import NameOID
from psycopg.rows import dict_row

import ct_scan


# Historical corpus query against the crt.sh "certwatch" schema.
# The inner sub-select narrows certificate_and_identities with a full-text
# identity match plus an ILIKE name filter, capped at %(max_candidates)s
# rows.  The outer query deduplicates per certificate, joins the issuing
# CA and lifecycle rows, and uses LATERAL joins to pick up (a) the newest
# CRL revocation entry for the certificate's serial and (b) the issuer's
# active-CRL state.  Results are returned oldest-first by not_before.
HISTORICAL_QUERY_SQL = """
WITH ci AS (
    SELECT
        min(sub.certificate_id) AS id,
        min(sub.issuer_ca_id) AS issuer_ca_id,
        x509_commonName(sub.certificate) AS common_name,
        x509_subjectName(sub.certificate) AS subject_dn,
        x509_notBefore(sub.certificate) AS not_before,
        x509_notAfter(sub.certificate) AS not_after,
        encode(x509_serialNumber(sub.certificate), 'hex') AS serial_number,
        sub.certificate AS certificate
    FROM (
        SELECT cai.*
        FROM certificate_and_identities cai
        WHERE plainto_tsquery('certwatch', %(domain)s) @@ identities(cai.certificate)
          AND cai.name_value ILIKE %(name_pattern)s ESCAPE '\\'
        LIMIT %(max_candidates)s
    ) sub
    GROUP BY sub.certificate
)
SELECT
    ci.id,
    ci.issuer_ca_id,
    ca.name AS issuer_name,
    ci.common_name,
    ci.subject_dn,
    ci.not_before,
    ci.not_after,
    cl.first_seen,
    ci.serial_number,
    coalesce(cl.revoked, 0) AS revoked_count,
    rev.revocation_date,
    rev.reason_code,
    rev.last_seen_check_date,
    crl_state.active_crl_count,
    crl_state.last_checked AS crl_last_checked,
    ci.certificate
FROM ci
JOIN ca ON ca.id = ci.issuer_ca_id
JOIN certificate_lifecycle cl ON cl.certificate_id = ci.id
LEFT JOIN LATERAL (
    SELECT
        cr.revocation_date,
        cr.reason_code,
        cr.last_seen_check_date
    FROM crl_revoked cr
    WHERE cr.ca_id = ci.issuer_ca_id
      AND cr.serial_number = decode(ci.serial_number, 'hex')
    ORDER BY cr.last_seen_check_date DESC NULLS LAST
    LIMIT 1
) rev ON TRUE
LEFT JOIN LATERAL (
    SELECT
        count(*) FILTER (
            WHERE crl.error_message IS NULL
              AND crl.next_update > now() AT TIME ZONE 'UTC'
        ) AS active_crl_count,
        max(crl.last_checked) AS last_checked
    FROM crl
    WHERE crl.ca_id = ci.issuer_ca_id
) crl_state ON TRUE
WHERE cl.certificate_type = 'Certificate'
ORDER BY ci.not_before ASC, cl.first_seen ASC NULLS LAST, ci.id ASC;
"""


# Hostname label tokens that suggest an environment or purpose split
# (dev/test/prod tiers, banking, mail, monitoring, ...).
# NOTE(review): the consumers of this set are outside this excerpt;
# presumably used to classify Subject CNs by deployment role — confirm.
ENV_TOKENS = {
    "api",
    "auth",
    "developer",
    "webbanking",
    "sandbox",
    "dev",
    "test",
    "qa",
    "uat",
    "preprod",
    "prod",
    "stage",
    "stg",
    "release",
    "replica",
    "support",
    "hotfix",
    "monitoring",
    "mail",
    "statement",
    "update",
    "secure",
}
+
+

What this block is doing

Historical query logic, data structures, and red-flag rules for certificate lifecycle analysis.

+

Flow arrows

Earlier blocks or operator input feed this block. → Module setup → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## HistoricalCertificate + + + + + + +
+
@dataclass
class HistoricalCertificate:
    """One deduplicated leaf certificate in the time-based corpus (expired included)."""

    fingerprint_sha256: str                 # SHA-256 over the DER bytes; deduplication key
    subject_cn: str                         # Subject Common Name ("-" when absent)
    subject_dn: str                         # full subject as an RFC 4514 string
    issuer_name: str                        # raw issuer name as returned by crt.sh
    issuer_family: str                      # short_issuer() bucket of issuer_name
    validity_not_before: datetime
    validity_not_after: datetime
    effective_not_after: datetime           # not_after, clamped to revocation_date when revoked earlier
    san_entries: list[str]
    first_seen: datetime | None             # crt.sh first_seen timestamp, if known
    current: bool                           # True when "now" falls inside the validity window
    revocation_status: str
    revocation_date: datetime | None
    # Set-valued fields accumulate when the same certificate matched
    # several search terms / crt.sh rows.
    matched_domains: set[str] = field(default_factory=set)
    crtsh_certificate_ids: set[int] = field(default_factory=set)
    serial_numbers: set[str] = field(default_factory=set)
+
+

What this block is doing

One certificate in the full time-based dataset, including expired ones.

+

Flow arrows

Earlier blocks or operator input feed this block. → HistoricalCertificate → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## CnCollisionRow + + + + + + +
+
@dataclass
class CnCollisionRow:
    """Report row: one Subject CN whose certificates drift across DNs or issuers."""

    subject_cn: str
    certificate_count: int          # all certificates seen for this CN
    current_certificate_count: int  # subset still within their validity window
    distinct_value_count: int       # number of distinct drifting values (DNs or issuers)
    issuer_families: str            # summary text of the issuer families involved
    details: str                    # human-readable drift detail
+
+

What this block is doing

A table row for Subject-DN drift or issuer drift under the same Subject CN.

+

Flow arrows

Earlier blocks or operator input feed this block. → CnCollisionRow → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## SanChangeRow + + + + + + +
+
@dataclass
class SanChangeRow:
    """Report row describing SAN-profile change for one Subject CN."""

    subject_cn: str
    certificate_count: int
    current_certificate_count: int
    distinct_san_profiles: int   # number of distinct SAN sets observed over time
    stable_entries: int          # SAN entries present in every profile
    variable_entries: int        # SAN entries that come and go between profiles
    delta_pattern: str           # categorical label for the change pattern
    representative_delta: str    # example of the added/removed entries
+
+

What this block is doing

A table row that describes SAN-profile change for one Subject CN.

+

Flow arrows

Earlier blocks or operator input feed this block. → SanChangeRow → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## StartDayRow + + + + + + +
+
@dataclass
class StartDayRow:
    """One calendar-day bucket in the issuance-burst table."""

    start_day: str           # ISO date bucket
    certificate_count: int   # certificate volume attributed to that day
    top_subjects: str        # most frequent Subject CNs in the bucket
    top_issuers: str         # most frequent issuers in the bucket
+
+

What this block is doing

This class is a structured container for one piece of data that later code passes around instead of juggling many loose variables.

+

Flow arrows

Earlier blocks or operator input feed this block. → StartDayRow → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## StepWeekRow + + + + + + +
+
@dataclass
class StepWeekRow:
    """One weekly bucket in the issuance-step table.

    Pairs the week's certificate volume with the trailing eight-week
    average so step-ups stand out.
    """

    week_start: str             # ISO date of the week's first day
    certificate_count: int      # certificate volume attributed to that week
    prior_eight_week_avg: str   # pre-formatted trailing average for comparison
    top_subjects: str           # most frequent Subject CNs in the bucket
    top_issuers: str            # most frequent issuers in the bucket
+
+

What this block is doing

This class is a structured container for one piece of data that later code passes around instead of juggling many loose variables.

+

Flow arrows

Earlier blocks or operator input feed this block. → StepWeekRow → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## OverlapRow + + + + + + +
+
@dataclass
class OverlapRow:
    """Report row: one Subject CN with long predecessor/successor validity overlap."""

    subject_cn: str
    asset_variant_count: int        # distinct asset variants seen under this CN
    current_certificate_count: int
    lineage: str                    # human-readable lineage summary
    max_concurrent: int             # peak number of simultaneously valid certificates
    max_overlap_days: int           # longest pairwise validity overlap, in days
    overlap_class: str              # severity/category label for the overlap
    details: str
+
+

What this block is doing

A table row describing long predecessor/successor overlap.

+

Flow arrows

Earlier blocks or operator input feed this block. → OverlapRow → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## RedFlagRow + + + + + + +
+
@dataclass
class RedFlagRow:
    """Compact summary row for a Subject CN worth analyst attention."""

    subject_cn: str
    score: int                      # aggregate red-flag score; higher means more signals
    certificate_count: int
    current_certificate_count: int
    flags: str                      # joined list of the triggered flag names
    notes: str                      # free-text explanation for the analyst
+
+

What this block is doing

A compact summary row for names worth attention.

+

Flow arrows

Earlier blocks or operator input feed this block. → RedFlagRow → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## HistoricalAssessment + + + + + + +
+
@dataclass
class HistoricalAssessment:
    """Complete historical-analysis bundle consumed by downstream reports."""

    domains: list[str]                                  # search terms the corpus was built from
    certificates: list[HistoricalCertificate]           # deduplicated leaf certificates
    cn_groups: dict[str, list[HistoricalCertificate]]   # certificates grouped by Subject CN
    # Subject-DN drift rows, overall and split current/past.
    dn_rows: list[CnCollisionRow]
    dn_current_rows: list[CnCollisionRow]
    dn_past_rows: list[CnCollisionRow]
    # Issuer and vendor drift rows (vendor split current/past as well).
    issuer_rows: list[CnCollisionRow]
    vendor_rows: list[CnCollisionRow]
    vendor_current_rows: list[CnCollisionRow]
    vendor_past_rows: list[CnCollisionRow]
    # SAN-profile drift rows and the distribution of delta patterns.
    san_rows: list[SanChangeRow]
    san_current_rows: list[SanChangeRow]
    san_past_rows: list[SanChangeRow]
    san_pattern_counts: Counter[str]
    # Long predecessor/successor overlap, split current/past.
    overlap_current_rows: list[OverlapRow]
    overlap_past_rows: list[OverlapRow]
    normal_reissuance_assets: int                       # assets whose reissuance looks routine
    repeated_asset_count: int
    # Names worth attention, among current and past certificates.
    current_red_flag_rows: list[RedFlagRow]
    past_red_flag_rows: list[RedFlagRow]
    # Issuance-burst time series.
    day_rows: list[StartDayRow]
    week_rows: list[StepWeekRow]
+
+

What this block is doing

The full historical analysis bundle used by the monograph.

+

Flow arrows

Earlier blocks or operator input feed this block. → HistoricalAssessment → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## parse_args + + + + + + +
+
def parse_args() -> argparse.Namespace:
    """Define and parse the command-line interface for the lineage report."""
    cli = argparse.ArgumentParser(
        description="Analyse historical certificate lineage, CN reuse, issuer drift, SAN drift, and issuance bursts."
    )
    # Input and cache controls.
    cli.add_argument("--domains-file", type=Path, default=Path("domains.local.txt"))
    cli.add_argument("--cache-dir", type=Path, default=Path(".cache/ct-history-v2"))
    cli.add_argument("--cache-ttl-seconds", type=int, default=0)
    cli.add_argument("--max-candidates-per-domain", type=int, default=10000)
    cli.add_argument("--retries", type=int, default=3)
    # Output artefacts.
    cli.add_argument(
        "--markdown-output",
        type=Path,
        default=Path("output/corpus/certificate-lineage-report.md"),
    )
    cli.add_argument(
        "--latex-output",
        type=Path,
        default=Path("output/corpus/certificate-lineage-report.tex"),
    )
    cli.add_argument(
        "--pdf-output",
        type=Path,
        default=Path("output/corpus/certificate-lineage-report.pdf"),
    )
    # Build behaviour switches.
    cli.add_argument("--skip-pdf", action="store_true")
    cli.add_argument("--pdf-engine", default="xelatex")
    cli.add_argument("--quiet", action="store_true")
    return cli.parse_args()
+
+

What this block is doing

This block defines the command-line knobs for the file: input paths, cache settings, output paths, and other runtime switches.

+

Flow arrows

Earlier blocks or operator input feed this block. → parse_args → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## short_issuer + + + + + + +
+
def short_issuer(issuer_name: str) -> str:
    """Map a raw issuer name to a short CA family label.

    Matching is case-insensitive substring search, checked in a fixed
    priority order; unknown issuers pass through unchanged.
    """
    families = (
        (("amazon",), "Amazon"),
        (("sectigo", "comodo"), "Sectigo/COMODO"),
        (("digicert",), "DigiCert"),
        (("symantec",), "Symantec"),
        (("verisign",), "VeriSign"),
        (("cloudflare",), "Cloudflare"),
        (("google trust services", "cn=we1"), "Google Trust Services"),
    )
    lowered = issuer_name.lower()
    for needles, label in families:
        if any(needle in lowered for needle in needles):
            return label
    return issuer_name
+
+

What this block is doing

Normalizes a raw issuer name to a short CA family label (Amazon, Sectigo/COMODO, DigiCert, Symantec, VeriSign, Cloudflare, Google Trust Services) using case-insensitive substring matching, and passes unknown issuers through unchanged so nothing is silently lost.

+

Flow arrows

Earlier blocks or operator input feed this block. → short_issuer → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## pct + + + + + + +
+
def pct(count: int, total: int) -> str:
    """Render *count* as a share of *total*, e.g. "12.5%"; "0.0%" for empty totals."""
    if total <= 0:
        return "0.0%"
    share = (count / total) * 100
    return f"{share:.1f}%"
+
+

What this block is doing

This is a small helper that keeps the larger analytical code cleaner and easier to reuse.

+

Flow arrows

Earlier blocks or operator input feed this block. → pct → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## md_table + + + + + + +
+
def md_table(headers: list[str], rows: list[list[str]]) -> list[str]:
    """Build a Markdown table (header, separator, data rows) as a list of lines."""

    def render(cells: list[str]) -> str:
        # One pipe-delimited Markdown table line.
        return "| " + " | ".join(cells) + " |"

    out = [render(headers), render(["---"] * len(headers))]
    out.extend(render(row) for row in rows)
    return out
+
+

What this block is doing

Renders a Markdown table as a list of text lines: one pipe-delimited header row, one `---` separator row, then one line per data row. Every report table in this file flows through this helper.

+

Flow arrows

Earlier blocks or operator input feed this block. → md_table → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## extract_common_name + + + + + + +
+
def extract_common_name(cert: x509.Certificate) -> str | None:
    """Return the certificate's first Subject CN attribute, or None when absent."""
    for attribute in cert.subject.get_attributes_for_oid(NameOID.COMMON_NAME):
        return attribute.value
    return None
+
+

What this block is doing

Reads the first Common Name attribute from a parsed certificate's subject, returning `None` when the subject carries no CN at all — callers then fall back to other identifying fields.

+

Flow arrows

Earlier blocks or operator input feed this block. → extract_common_name → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## query_historical_domain + + + + + + +
+
def query_historical_domain(domain: str, max_candidates: int, attempts: int, quiet: bool) -> list[ct_scan.DatabaseRecord]:
    """Fetch the complete historical CT corpus for one search term.

    Raises ValueError up front when the raw identity-match count exceeds
    *max_candidates*, because a silently truncated result set would bias
    every downstream lineage statistic.  Transient query failures are
    retried with exponential backoff (capped at 10s); the last error is
    re-raised once all *attempts* are exhausted.
    """
    # Local imports replace the original __import__("sys")/__import__("time")
    # calls; sys/time are only needed on the warn/retry path.
    import sys
    import time

    raw_match_count = ct_scan.query_raw_match_count(domain=domain, attempts=attempts, verbose=not quiet)
    if raw_match_count > max_candidates:
        raise ValueError(
            f"domain={domain} raw identity matches={raw_match_count} exceed max_candidates={max_candidates}; "
            f"increase --max-candidates-per-domain to at least {raw_match_count} for a complete result set"
        )
    params = {
        "domain": domain,
        "name_pattern": f"%{ct_scan.escape_like(domain)}%",
        "max_candidates": max_candidates,
    }
    last_error: Exception | None = None
    for attempt in range(1, attempts + 1):
        try:
            with ct_scan.connect() as conn, conn.cursor(row_factory=dict_row) as cur:
                cur.execute(HISTORICAL_QUERY_SQL, params)
                rows = cur.fetchall()
            return [ct_scan.row_to_record(domain, row) for row in rows]
        except Exception as exc:  # broad on purpose: psycopg errors vary by failure mode
            last_error = exc
            if attempt == attempts:
                break
            if not quiet:
                print(
                    f"[warn] historical domain={domain} attempt={attempt}/{attempts} failed: {exc}",
                    file=sys.stderr,
                )
            time.sleep(min(2 ** attempt, 10))
    # Explicit raise instead of the original `assert`, which is stripped
    # under `python -O`; last_error is only None when attempts < 1.
    if last_error is None:
        raise ValueError(f"attempts must be >= 1, got {attempts}")
    raise last_error
+
+

What this block is doing

Fetches the wider historical corpus for one search term.

+

Flow arrows

A configured search domain. → query_historical_domain → `load_records` uses it to build the wider historical corpus.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## load_records + + + + + + +
+
def load_records(args: argparse.Namespace) -> tuple[list[str], list[ct_scan.DatabaseRecord]]:
    """Load historical CT records for every configured domain, cache-first.

    For each domain, a fresh-enough on-disk cache entry is used when
    available; otherwise the domain is queried live and the result is
    written back to the cache.  Returns the domain list together with the
    concatenated records from all domains.
    """
    # Local import replaces the original __import__("sys") calls.
    import sys

    domains = ct_scan.load_domains(args.domains_file)
    all_records: list[ct_scan.DatabaseRecord] = []
    for domain in domains:
        cached = ct_scan.load_cached_records(
            cache_dir=args.cache_dir,
            domain=domain,
            ttl_seconds=args.cache_ttl_seconds,
            max_candidates=args.max_candidates_per_domain,
        )
        if cached is not None:
            if not args.quiet:
                print(f"[cache] historical domain={domain} records={len(cached)}", file=sys.stderr)
            all_records.extend(cached)
            continue
        if not args.quiet:
            print(f"[query] historical domain={domain}", file=sys.stderr)
        queried = query_historical_domain(
            domain=domain,
            max_candidates=args.max_candidates_per_domain,
            attempts=args.retries,
            quiet=args.quiet,
        )
        ct_scan.store_cached_records(args.cache_dir, domain, args.max_candidates_per_domain, queried)
        all_records.extend(queried)
    return domains, all_records
+
+

What this block is doing

This block loads data from disk, cache, or an earlier stage so later code can work with it.

+

Flow arrows

Earlier blocks or operator input feed this block. → load_records → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## build_certificates + + + + + + +
+
def build_certificates(records: list[ct_scan.DatabaseRecord]) -> list[HistoricalCertificate]:
    """Deduplicate raw CT rows into one HistoricalCertificate per leaf certificate.

    Rows sharing a SHA-256 fingerprint of the DER bytes are merged: matched
    domains, crt.sh certificate ids, and serial numbers accumulate onto the
    first-built entry, and ``first_seen`` keeps the earliest non-None value.
    Non-leaf certificates are dropped. The result is sorted by Subject CN
    (case-insensitive), then not-before, then fingerprint.
    """
    # Naive UTC "now": tzinfo is stripped so comparisons against the record's
    # (naive) not_before/not_after timestamps don't raise.
    now = datetime.now(UTC).replace(tzinfo=None)
    by_fingerprint: dict[str, HistoricalCertificate] = {}
    for record in records:
        cert = x509.load_der_x509_certificate(record.certificate_der)
        is_leaf, _reason = ct_scan.is_leaf_certificate(cert)
        if not is_leaf:
            continue
        fingerprint_sha256 = hashlib.sha256(record.certificate_der).hexdigest()
        hit = by_fingerprint.get(fingerprint_sha256)
        if hit is None:
            # Prefer the DB-provided CN; fall back to parsing the cert; "-" as last resort.
            subject_cn = record.common_name or extract_common_name(cert) or "-"
            revocation_status, revocation_date, _revocation_reason, _crtsh_crl_timestamp, _revocation_note = ct_scan.revocation_fields(record)
            # A revocation that predates expiry shortens the effective lifetime,
            # which the overlap analysis later relies on.
            effective_not_after = record.not_after
            if revocation_status == "revoked" and revocation_date is not None and revocation_date < effective_not_after:
                effective_not_after = revocation_date
            hit = HistoricalCertificate(
                fingerprint_sha256=fingerprint_sha256,
                subject_cn=subject_cn,
                subject_dn=cert.subject.rfc4514_string(),
                issuer_name=record.issuer_name,
                issuer_family=short_issuer(record.issuer_name),
                validity_not_before=record.not_before,
                validity_not_after=record.not_after,
                effective_not_after=effective_not_after,
                san_entries=ct_scan.extract_san_entries(cert),
                first_seen=record.first_seen,
                current=record.not_before <= now <= record.not_after,
                revocation_status=revocation_status,
                revocation_date=revocation_date,
                matched_domains={record.domain},
                crtsh_certificate_ids={record.certificate_id},
                serial_numbers={record.serial_number},
            )
            by_fingerprint[fingerprint_sha256] = hit
            continue
        # Duplicate fingerprint: same certificate seen via another search term
        # or crt.sh row — merge the row-level metadata into the existing entry.
        hit.matched_domains.add(record.domain)
        hit.crtsh_certificate_ids.add(record.certificate_id)
        hit.serial_numbers.add(record.serial_number)
        if hit.first_seen is None or (record.first_seen is not None and record.first_seen < hit.first_seen):
            hit.first_seen = record.first_seen
    return sorted(
        by_fingerprint.values(),
        key=lambda item: (
            item.subject_cn.casefold(),
            item.validity_not_before,
            item.fingerprint_sha256,
        ),
    )
+
+

What this block is doing

Converts raw DB rows into historical working objects.

+

Flow arrows

Historical `DatabaseRecord` rows. → build_certificates → `group_by_subject_cn` and all drift checks consume these normalized historical certificates.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## group_by_subject_cn
+
def group_by_subject_cn(certificates: list[HistoricalCertificate]) -> dict[str, list[HistoricalCertificate]]:
    """Bucket certificates under their case-insensitive Subject CN."""
    buckets: dict[str, list[HistoricalCertificate]] = defaultdict(list)
    for item in certificates:
        key = item.subject_cn.lower()
        buckets[key].append(item)
    return buckets
+
+

What this block is doing

This block clusters related items together so later code can analyze them as families instead of as isolated rows.

+

Flow arrows

Historical certificates. → group_by_subject_cn → `dn_change_rows`, `issuer_change_rows`, `san_change_rows`, and `overlap_rows` all work off this grouping.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## summarize_name_list
+
def summarize_name_list(values: set[str], limit: int = 3) -> str:
    """Render a case-insensitively sorted, comma-joined preview of *values*.

    At most *limit* entries are shown; any remainder is summarized as a count.
    """
    names = sorted(values, key=str.casefold)
    text = ", ".join(names[:limit])
    hidden = len(names) - limit
    if hidden > 0:
        text += f", ... (+{hidden} more)"
    return text
+
+

What this block is doing

This block compresses many detailed rows into a smaller, easier-to-read summary.

+

Flow arrows

Earlier blocks or operator input feed this block. → summarize_name_list → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## family_counter
+
def family_counter(values: list[HistoricalCertificate]) -> Counter[str]:
    """Tally how many certificates each issuer family contributed."""
    tally: Counter[str] = Counter()
    for certificate in values:
        tally[certificate.issuer_family] += 1
    return tally
+
+

What this block is doing

This function is one of the building blocks inside `ct_lineage_report.py`. It exists so the file can do one narrow job at a time instead of one giant unreadable routine.

+

Flow arrows

Earlier blocks or operator input feed this block. → family_counter → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## dn_change_rows
+
def dn_change_rows(cn_groups: dict[str, list[HistoricalCertificate]]) -> list[CnCollisionRow]:
    """Report Subject CNs whose full Subject DN varies across their certificates.

    Rows are sorted by distinct-DN count desc, certificate count desc, then CN.
    """
    collected: list[CnCollisionRow] = []
    for group in cn_groups.values():
        distinct_dns = {certificate.subject_dn for certificate in group}
        if len(distinct_dns) < 2:
            continue  # a single DN means no drift for this CN
        display_cn = min({certificate.subject_cn for certificate in group}, key=str.casefold)
        family_mix = ", ".join(
            f"{family} ({count})" for family, count in family_counter(group).most_common()
        )
        collected.append(
            CnCollisionRow(
                subject_cn=display_cn,
                certificate_count=len(group),
                current_certificate_count=sum(1 for certificate in group if certificate.current),
                distinct_value_count=len(distinct_dns),
                issuer_families=family_mix,
                details=summarize_name_list(distinct_dns, limit=2),
            )
        )
    collected.sort(
        key=lambda row: (-row.distinct_value_count, -row.certificate_count, row.subject_cn.casefold())
    )
    return collected
+
+

What this block is doing

Finds names whose formal Subject DN changed over time.

+

Flow arrows

CN-grouped historical certificates. → dn_change_rows → `build_assessment` uses these rows for Subject-DN drift sections.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## issuer_change_rows
+
def issuer_change_rows(
    cn_groups: dict[str, list[HistoricalCertificate]],
) -> tuple[list[CnCollisionRow], list[CnCollisionRow]]:
    """Split issuer drift into exact issuer-name changes and coarser CA-family changes.

    Returns ``(exact_rows, vendor_rows)``, each sorted by distinct-value count
    desc, certificate count desc, then CN.
    """

    def make_row(group: list[HistoricalCertificate], display_cn: str, values: set[str], limit: int) -> CnCollisionRow:
        # Both views share the same row shape; only the drifting value set differs.
        return CnCollisionRow(
            subject_cn=display_cn,
            certificate_count=len(group),
            current_certificate_count=sum(1 for certificate in group if certificate.current),
            distinct_value_count=len(values),
            issuer_families=", ".join(
                f"{family} ({count})" for family, count in family_counter(group).most_common()
            ),
            details=summarize_name_list(values, limit=limit),
        )

    exact_rows: list[CnCollisionRow] = []
    vendor_rows: list[CnCollisionRow] = []
    for group in cn_groups.values():
        names = {certificate.issuer_name for certificate in group}
        families = {certificate.issuer_family for certificate in group}
        display_cn = min({certificate.subject_cn for certificate in group}, key=str.casefold)
        if len(names) > 1:
            exact_rows.append(make_row(group, display_cn, names, 3))
        if len(families) > 1:
            vendor_rows.append(make_row(group, display_cn, families, 4))

    def ordering(row: CnCollisionRow):
        return (-row.distinct_value_count, -row.certificate_count, row.subject_cn.casefold())

    return (sorted(exact_rows, key=ordering), sorted(vendor_rows, key=ordering))
+
+

What this block is doing

Finds names whose issuing CA family changed over time.

+

Flow arrows

CN-grouped historical certificates. → issuer_change_rows → `build_assessment` uses these rows for CA-family drift sections.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## classify_san_delta
+
def classify_san_delta(delta_entries: set[str]) -> str:
    """Label the kind of SAN churn represented by *delta_entries*.

    Checks, in order: non-DNS drift, www toggling, cross-zone bridging,
    environment/fleet markers, then small vs. broad alias changes.
    """
    dns_names = [entry.removeprefix("DNS:") for entry in delta_entries if entry.startswith("DNS:")]
    if not dns_names:
        return "non-DNS SAN drift"
    # Every changed name is either a www.* form or has its www.* twin in the delta.
    www_only = all(
        name.startswith("www.") or f"www.{name}" in dns_names
        for name in dns_names
    )
    if www_only:
        return "www toggle"
    zones = {ct_scan.san_tail_split(name)[1] for name in dns_names}
    if len(zones) > 1:
        return "cross-zone bridge change"
    joined = " ".join(dns_names).lower()
    has_env_token = any(token in joined for token in ENV_TOKENS)
    has_digit = any(character.isdigit() for character in joined)
    if has_env_token or has_digit:
        return "environment or fleet change"
    return "small alias change" if len(dns_names) <= 3 else "broad SAN redesign"
+
+

What this block is doing

This block applies rules and chooses a category label.

+

Flow arrows

Earlier blocks or operator input feed this block. → classify_san_delta → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## representative_delta
+
def representative_delta(delta_entries: set[str]) -> str:
    """Show up to four delta entries, case-insensitively sorted, plus a remainder count.

    An empty set renders as "-".
    """
    ordered = sorted(delta_entries, key=str.casefold)
    if not ordered:
        return "-"
    preview = ", ".join(ordered[:4])
    remainder = len(ordered) - 4
    if remainder > 0:
        preview += f", ... (+{remainder} more)"
    return preview
+
+

What this block is doing

This function is one of the building blocks inside `ct_lineage_report.py`. It exists so the file can do one narrow job at a time instead of one giant unreadable routine.

+

Flow arrows

Earlier blocks or operator input feed this block. → representative_delta → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## san_change_rows
+
def san_change_rows(cn_groups: dict[str, list[HistoricalCertificate]]) -> tuple[list[SanChangeRow], Counter[str]]:
    """Find CNs whose SAN bundle changed across certificates, tagging each with a drift pattern.

    Returns the sorted drift rows plus a Counter of how often each pattern occurred.
    """
    drift_rows: list[SanChangeRow] = []
    pattern_counts: Counter[str] = Counter()
    for group in cn_groups.values():
        profiles = {tuple(certificate.san_entries) for certificate in group}
        if len(profiles) < 2:
            continue  # every certificate carries the same SAN bundle
        display_cn = min({certificate.subject_cn for certificate in group}, key=str.casefold)
        profile_sets = [set(profile) for profile in profiles]
        # Entries in every profile are "stable"; everything else is the churn.
        stable = set.intersection(*profile_sets)
        union = set.union(*profile_sets)
        delta = union - stable
        pattern = classify_san_delta(delta)
        pattern_counts[pattern] += 1
        drift_rows.append(
            SanChangeRow(
                subject_cn=display_cn,
                certificate_count=len(group),
                current_certificate_count=sum(1 for certificate in group if certificate.current),
                distinct_san_profiles=len(profiles),
                stable_entries=len(stable),
                variable_entries=len(delta),
                delta_pattern=pattern,
                representative_delta=representative_delta(delta),
            )
        )
    drift_rows.sort(
        key=lambda row: (
            -row.distinct_san_profiles,
            -row.variable_entries,
            -row.certificate_count,
            row.subject_cn.casefold(),
        )
    )
    return drift_rows, pattern_counts
+
+

What this block is doing

Finds names whose SAN bundle changed over time.

+

Flow arrows

CN-grouped historical certificates. → san_change_rows → `build_assessment` uses these rows for SAN-drift sections.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## overlap_days
+
def overlap_days(left: HistoricalCertificate, right: HistoricalCertificate) -> int:
    """Whole days the two effective validity windows overlap.

    Disjoint (or merely touching) windows yield 0; any real overlap yields at
    least 1, even when it is shorter than a day.
    """
    latest_start = max(left.validity_not_before, right.validity_not_before)
    earliest_end = min(left.effective_not_after, right.effective_not_after)
    if earliest_end <= latest_start:
        return 0
    return max((earliest_end - latest_start).days, 1)
+
+

What this block is doing

This function is one of the building blocks inside `ct_lineage_report.py`. It exists so the file can do one narrow job at a time instead of one giant unreadable routine.

+

Flow arrows

Earlier blocks or operator input feed this block. → overlap_days → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## overlap_class
+
def overlap_class(days: int) -> str:
    """Classify an overlap length: none, normal rollover, or a >=50-day red flag."""
    if days >= 50:
        return "red flag (>=50 days)"
    return "normal rollover" if days > 0 else "no overlap"
+
+

What this block is doing

This function is one of the building blocks inside `ct_lineage_report.py`. It exists so the file can do one narrow job at a time instead of one giant unreadable routine.

+

Flow arrows

Earlier blocks or operator input feed this block. → overlap_class → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## build_asset_key
+
def build_asset_key(certificate: HistoricalCertificate) -> tuple[str, str, tuple[str, ...], str]:
    """Identity key for one renewable asset: lowercased CN, exact DN, SAN bundle, CA family."""
    lowered_cn = certificate.subject_cn.lower()
    san_bundle = tuple(certificate.san_entries)
    return (lowered_cn, certificate.subject_dn, san_bundle, certificate.issuer_family)
+
+

What this block is doing

This block constructs a richer higher-level result from simpler inputs.

+

Flow arrows

Earlier blocks or operator input feed this block. → build_asset_key → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## overlap_metrics
+
def overlap_metrics(certificates: list[HistoricalCertificate]) -> tuple[int, int]:
    """Return (max pairwise overlap in days, max number of simultaneously valid certificates).

    Sweeps the certificates in timeline order, keeping a list of still-open
    validity windows so each new certificate is only compared against live ones.
    """
    if len(certificates) < 2:
        # Zero or one certificate: nothing can overlap; concurrency is at most 1.
        return (0, max(1, len(certificates)))
    timeline = sorted(
        certificates,
        key=lambda certificate: (
            certificate.validity_not_before,
            certificate.effective_not_after,
            certificate.fingerprint_sha256,
        ),
    )
    best_overlap = 0
    best_concurrent = 1
    open_windows: list[HistoricalCertificate] = []
    for certificate in timeline:
        # Drop windows that closed before this certificate even started.
        open_windows = [
            other for other in open_windows
            if other.effective_not_after > certificate.validity_not_before
        ]
        for other in open_windows:
            candidate = overlap_days(other, certificate)
            if candidate > best_overlap:
                best_overlap = candidate
        open_windows.append(certificate)
        if len(open_windows) > best_concurrent:
            best_concurrent = len(open_windows)
    return (best_overlap, best_concurrent)
+
+

What this block is doing

This function is one of the building blocks inside `ct_lineage_report.py`. It exists so the file can do one narrow job at a time instead of one giant unreadable routine.

+

Flow arrows

Earlier blocks or operator input feed this block. → overlap_metrics → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## overlap_row_from_asset
+
def overlap_row_from_asset(
    asset_certificates: list[HistoricalCertificate],
    overlap_days_value: int,
    max_concurrent: int,
    details_prefix: str,
) -> OverlapRow:
    """Build one overlap-report row from an asset's certificate lineage.

    The earliest certificate in the lineage serves as the representative for
    CN, DN, SAN count, and issuer family; up to four validity windows are shown.
    """
    timeline = sorted(
        asset_certificates,
        key=lambda certificate: (
            certificate.validity_not_before,
            certificate.effective_not_after,
            certificate.fingerprint_sha256,
        ),
    )
    representative = timeline[0]
    windows = ", ".join(
        f"{certificate.validity_not_before.date().isoformat()}->{certificate.effective_not_after.date().isoformat()}"
        for certificate in timeline[:4]
    )
    suffix = "" if len(timeline) <= 4 else f", ... (+{len(timeline) - 4} more)"
    details = (
        f"{details_prefix}; "
        f"DN={representative.subject_dn}; "
        f"SANs={len(representative.san_entries)}; "
        f"windows={windows}{suffix}"
    )
    return OverlapRow(
        subject_cn=representative.subject_cn,
        asset_variant_count=len(timeline),
        current_certificate_count=sum(1 for certificate in timeline if certificate.current),
        lineage=representative.issuer_family,
        max_concurrent=max_concurrent,
        max_overlap_days=overlap_days_value,
        overlap_class=overlap_class(overlap_days_value),
        details=details,
    )
+
+

What this block is doing

This function is one of the building blocks inside `ct_lineage_report.py`. It exists so the file can do one narrow job at a time instead of one giant unreadable routine.

+

Flow arrows

Earlier blocks or operator input feed this block. → overlap_row_from_asset → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## overlap_rows
+
def overlap_rows(cn_groups: dict[str, list[HistoricalCertificate]]) -> tuple[list[OverlapRow], list[OverlapRow], int, int]:
    """Partition repeated renewal assets by overlap severity.

    Returns ``(current_red_flags, past_red_flags, normal_reissuance_count,
    repeated_asset_count)`` where red flags are overlaps of >=50 days, split by
    whether a currently valid pair still exhibits the overlap.
    """
    normal_reissuance = 0
    repeated_asset_count = 0
    current_red_flags: list[OverlapRow] = []
    past_red_flags: list[OverlapRow] = []
    for group in cn_groups.values():
        assets: dict[tuple[str, str, tuple[str, ...], str], list[HistoricalCertificate]] = defaultdict(list)
        for certificate in group:
            assets[build_asset_key(certificate)].append(certificate)
        for lineage in assets.values():
            if len(lineage) < 2:
                continue  # single issuance: no renewal lineage to inspect
            repeated_asset_count += 1
            max_overlap, max_concurrent = overlap_metrics(lineage)
            still_valid = [certificate for certificate in lineage if certificate.current]
            current_overlap, current_concurrent = overlap_metrics(still_valid)
            if max_overlap < 50:
                # Never crossed the red-flag threshold at any point in history.
                normal_reissuance += 1
            elif current_overlap >= 50:
                current_red_flags.append(
                    overlap_row_from_asset(
                        still_valid,
                        current_overlap,
                        current_concurrent,
                        f"current overlap persists; historical max overlap={max_overlap} days",
                    )
                )
            else:
                past_red_flags.append(
                    overlap_row_from_asset(
                        lineage,
                        max_overlap,
                        max_concurrent,
                        "historical overlap reached red-flag territory, but no currently valid pair still does",
                    )
                )

    def ordering(row: OverlapRow):
        return (-row.max_overlap_days, -row.max_concurrent, -row.asset_variant_count, row.subject_cn.casefold())

    return (
        sorted(current_red_flags, key=ordering),
        sorted(past_red_flags, key=ordering),
        normal_reissuance,
        repeated_asset_count,
    )
+
+

What this block is doing

Finds predecessor/successor pairs that overlap too long.

+

Flow arrows

CN-grouped historical certificates. → overlap_rows → `build_assessment` turns these into current and past overlap red flags.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## build_red_flag_rows
+
def build_red_flag_rows(
    cn_groups: dict[str, list[HistoricalCertificate]],
    dn_rows: list[CnCollisionRow],
    vendor_rows: list[CnCollisionRow],
    san_rows: list[SanChangeRow],
    overlap_rows_: list[OverlapRow],
) -> list[RedFlagRow]:
    """Combine the four drift signals into one scored red-flag row per affected Subject CN.

    The score is simply how many distinct signals flagged the CN; rows sort by
    score desc, certificate count desc, then CN.
    """
    # Label/membership pairs, in the display order the flags should appear.
    flag_sources = [
        ("overlap >=50 days", {row.subject_cn.lower() for row in overlap_rows_}),
        ("Subject DN drift", {row.subject_cn.lower() for row in dn_rows}),
        ("CA lineage drift", {row.subject_cn.lower() for row in vendor_rows}),
        ("SAN drift", {row.subject_cn.lower() for row in san_rows}),
    ]
    rows: list[RedFlagRow] = []
    for key, group in cn_groups.items():
        flags = [label for label, members in flag_sources if key in members]
        if not flags:
            continue
        issuer_mix = Counter(certificate.issuer_family for certificate in group)
        rows.append(
            RedFlagRow(
                subject_cn=min({certificate.subject_cn for certificate in group}, key=str.casefold),
                score=len(flags),
                certificate_count=len(group),
                current_certificate_count=sum(1 for certificate in group if certificate.current),
                flags=", ".join(flags),
                notes=", ".join(f"{family} ({count})" for family, count in issuer_mix.most_common()),
            )
        )
    rows.sort(key=lambda row: (-row.score, -row.certificate_count, row.subject_cn.casefold()))
    return rows
+
+

What this block is doing

This block constructs a richer higher-level result from simpler inputs.

+

Flow arrows

Earlier blocks or operator input feed this block. → build_red_flag_rows → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## top_start_days
+
def top_start_days(certificates: list[HistoricalCertificate], limit: int = 12) -> list[StartDayRow]:
    """Rank calendar days by how many certificates became valid on them.

    Returns at most *limit* rows, busiest day first (ties broken by earlier day).
    """
    by_day: dict[date, list[HistoricalCertificate]] = defaultdict(list)
    for certificate in certificates:
        by_day[certificate.validity_not_before.date()].append(certificate)
    ranked = sorted(by_day.items(), key=lambda entry: (-len(entry[1]), entry[0]))
    rows: list[StartDayRow] = []
    for start_day, day_items in ranked[:limit]:
        subject_counts = Counter(item.subject_cn for item in day_items)
        issuer_counts = Counter(item.issuer_family for item in day_items)
        rows.append(
            StartDayRow(
                start_day=start_day.isoformat(),
                certificate_count=len(day_items),
                top_subjects=", ".join(f"{name} ({count})" for name, count in subject_counts.most_common(4)),
                top_issuers=", ".join(f"{name} ({count})" for name, count in issuer_counts.most_common()),
            )
        )
    return rows
+
+

What this block is doing

This function is one of the building blocks inside `ct_lineage_report.py`. It exists so the file can do one narrow job at a time instead of one giant unreadable routine.

+

Flow arrows

Earlier blocks or operator input feed this block. → top_start_days → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## spike_weeks
+
def spike_weeks(certificates: list[HistoricalCertificate], min_count: int = 8) -> list[StepWeekRow]:
    """Flag weeks whose issuance volume jumps well above the prior eight-week average.

    A week is flagged when it has at least four prior weeks of history, at least
    *min_count* certificates, and its count at least doubles the rolling
    baseline or exceeds it by 8.
    """
    by_week: dict[date, list[HistoricalCertificate]] = defaultdict(list)
    for certificate in certificates:
        start_day = certificate.validity_not_before.date()
        # Normalize each start date to the Monday of its week.
        monday = start_day - timedelta(days=start_day.weekday())
        by_week[monday].append(certificate)
    ordered_weeks = sorted(by_week)
    counts = [len(by_week[week]) for week in ordered_weeks]
    flagged: list[StepWeekRow] = []
    for index, week in enumerate(ordered_weeks):
        week_count = counts[index]
        window = counts[max(0, index - 8):index]
        if len(window) < 4:
            continue  # not enough history to call anything a spike
        baseline = sum(window) / len(window)
        if week_count < min_count:
            continue  # too small in absolute terms to matter
        # Keep only weeks that double the baseline or beat it by 8.
        if not (week_count >= baseline * 2 or week_count >= baseline + 8):
            continue
        members = by_week[week]
        subject_counts = Counter(item.subject_cn for item in members)
        issuer_counts = Counter(item.issuer_family for item in members)
        flagged.append(
            StepWeekRow(
                week_start=week.isoformat(),
                certificate_count=week_count,
                prior_eight_week_avg=f"{baseline:.1f}",
                top_subjects=", ".join(f"{name} ({count})" for name, count in subject_counts.most_common(4)),
                top_issuers=", ".join(f"{name} ({count})" for name, count in issuer_counts.most_common()),
            )
        )
    return flagged
+
+

What this block is doing

This function is one of the building blocks inside `ct_lineage_report.py`. It exists so the file can do one narrow job at a time instead of one giant unreadable routine.

+

Flow arrows

Earlier blocks or operator input feed this block. → spike_weeks → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## partition_collision_rows
+
def partition_collision_rows(
    rows: list[CnCollisionRow],
    cn_groups: dict[str, list[HistoricalCertificate]],
    value_getter,
) -> tuple[list[CnCollisionRow], list[CnCollisionRow]]:
    """Split drift rows into still-drifting vs. resolved.

    A row still drifts when, among the CN's *currently valid* certificates,
    ``value_getter`` yields more than one distinct value.
    """
    still_drifting: list[CnCollisionRow] = []
    resolved: list[CnCollisionRow] = []
    for row in rows:
        group = cn_groups[row.subject_cn.lower()]
        live_values = {value_getter(certificate) for certificate in group if certificate.current}
        bucket = still_drifting if len(live_values) > 1 else resolved
        bucket.append(row)
    return still_drifting, resolved
+
+

What this block is doing

This function is one of the building blocks inside `ct_lineage_report.py`. It exists so the file can do one narrow job at a time instead of one giant unreadable routine.

+

Flow arrows

Earlier blocks or operator input feed this block. → partition_collision_rows → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## partition_san_rows
+
def partition_san_rows(
    rows: list[SanChangeRow],
    cn_groups: dict[str, list[HistoricalCertificate]],
) -> tuple[list[SanChangeRow], list[SanChangeRow]]:
    """Split SAN-drift rows by whether multiple SAN profiles are still live.

    A row stays in the "current" bucket when the CN's currently valid
    certificates carry more than one distinct SAN bundle.
    """
    live_drift: list[SanChangeRow] = []
    settled: list[SanChangeRow] = []
    for row in rows:
        group = cn_groups[row.subject_cn.lower()]
        live_profiles = {tuple(certificate.san_entries) for certificate in group if certificate.current}
        target = live_drift if len(live_profiles) > 1 else settled
        target.append(row)
    return live_drift, settled
+
+

What this block is doing

This function is one of the building blocks inside `ct_lineage_report.py`. It exists so the file can do one narrow job at a time instead of one giant unreadable routine.

+

Flow arrows

Earlier blocks or operator input feed this block. → partition_san_rows → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## build_assessment
+
def build_assessment(args: argparse.Namespace) -> HistoricalAssessment:
    """Run the full historical lineage pipeline and return the assembled assessment.

    Stages: load records -> normalize to certificates -> group by CN ->
    compute DN/issuer/SAN/overlap drift -> split each drift kind into
    current vs. past-only -> score red flags -> issuance-burst stats.
    """
    # Stage 1: raw rows, then one deduplicated certificate object per leaf cert.
    domains, records = load_records(args)
    certificates = build_certificates(records)
    # Stage 2: every drift analysis works off the case-insensitive CN grouping.
    cn_groups = group_by_subject_cn(certificates)
    dn_rows = dn_change_rows(cn_groups)
    issuer_rows, vendor_rows = issuer_change_rows(cn_groups)
    san_rows, san_pattern_counts = san_change_rows(cn_groups)
    overlap_current_rows, overlap_past_rows, normal_reissuance_assets, repeated_asset_count = overlap_rows(cn_groups)
    # Stage 3: partition each drift kind by whether it persists among currently
    # valid certificates; vendor (CA-family) rows feed the red-flag scoring,
    # exact issuer-name rows are kept for reporting only.
    dn_current_rows, dn_past_rows = partition_collision_rows(dn_rows, cn_groups, lambda item: item.subject_dn)
    vendor_current_rows, vendor_past_rows = partition_collision_rows(vendor_rows, cn_groups, lambda item: item.issuer_family)
    san_current_rows, san_past_rows = partition_san_rows(san_rows, cn_groups)
    # Stage 4: combine the current-signal and past-signal partitions separately.
    current_red_flag_rows = build_red_flag_rows(
        cn_groups,
        dn_current_rows,
        vendor_current_rows,
        san_current_rows,
        overlap_current_rows,
    )
    past_red_flag_rows = build_red_flag_rows(
        cn_groups,
        dn_past_rows,
        vendor_past_rows,
        san_past_rows,
        overlap_past_rows,
    )
    # Stage 5: issuance-burst views (busiest start days, spike weeks).
    day_rows = top_start_days(certificates)
    week_rows = spike_weeks(certificates)
    return HistoricalAssessment(
        domains=domains,
        certificates=certificates,
        cn_groups=cn_groups,
        dn_rows=dn_rows,
        dn_current_rows=dn_current_rows,
        dn_past_rows=dn_past_rows,
        issuer_rows=issuer_rows,
        vendor_rows=vendor_rows,
        vendor_current_rows=vendor_current_rows,
        vendor_past_rows=vendor_past_rows,
        san_rows=san_rows,
        san_current_rows=san_current_rows,
        san_past_rows=san_past_rows,
        san_pattern_counts=san_pattern_counts,
        overlap_current_rows=overlap_current_rows,
        overlap_past_rows=overlap_past_rows,
        normal_reissuance_assets=normal_reissuance_assets,
        repeated_asset_count=repeated_asset_count,
        current_red_flag_rows=current_red_flag_rows,
        past_red_flag_rows=past_red_flag_rows,
        day_rows=day_rows,
        week_rows=week_rows,
    )
+
+

What this block is doing

Runs the full historical workflow and returns the finished analytical bundle.

+

Flow arrows

Historical records from all configured domains. → build_assessment → The monograph and standalone historical reports consume this one big bundle.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## render_markdown
+
def render_markdown(args: argparse.Namespace, assessment: HistoricalAssessment) -> None:
+    args.markdown_output.parent.mkdir(parents=True, exist_ok=True)
+    certificates = assessment.certificates
+    current_count = sum(1 for item in certificates if item.current)
+    cn_groups = assessment.cn_groups
+    repeated_cn_count = sum(1 for values in cn_groups.values() if len(values) > 1)
+    same_cn_same_dn = sum(1 for values in cn_groups.values() if len(values) > 1 and len({item.subject_dn for item in values}) == 1)
+
+    lines: list[str] = []
+    lines.append("# Historical Certificate Lineage Analysis")
+    lines.append("")
+    lines.append(f"Generated: {ct_scan.utc_iso(datetime.now(UTC))}")
+    lines.append(f"Configured search terms file: `{args.domains_file.name}`")
+    lines.append("")
+    lines.append("## Executive Summary")
+    lines.append("")
+    lines.extend(
+        [
+            f"- Historical unique leaf certificates in scope: **{len(certificates)}**.",
+            f"- Currently valid subset inside that historical corpus: **{current_count}**.",
+            f"- Distinct Subject CN values: **{len(cn_groups)}**.",
+            f"- Subject CNs with more than one certificate over time: **{repeated_cn_count}**.",
+            f"- Renewal asset lineages with only normal rollover overlap (`<50 days`): **{assessment.normal_reissuance_assets}**.",
+            f"- Renewal asset lineages with a current overlap red flag (`>=50 days`): **{len(assessment.overlap_current_rows)}**.",
+            f"- Renewal asset lineages with a past-only overlap red flag now fixed: **{len(assessment.overlap_past_rows)}**.",
+            f"- Subject CN values with current red flags: **{len(assessment.current_red_flag_rows)}**.",
+            f"- Subject CN values with past-only red flags now fixed: **{len(assessment.past_red_flag_rows)}**.",
+        ]
+    )
+    lines.append("")
+    lines.append("This report treats Subject CN as a hostname label, not as a unique asset key. The point is to follow certificate lineage through renewals, issuer changes, SAN changes, and issuance bursts across both current and expired certificates, while separating normal rollover from red-flag behavior.")
+    lines.append("")
+    lines.append("## Reading Notes")
+    lines.append("")
+    lines.extend(
+        [
+            "- **Subject CN** is the hostname placed in the certificate's Common Name field.",
+            "- **Subject DN** is the full subject identity string, not just the hostname.",
+            "- **SAN profile** means the complete set of SAN entries carried by a certificate.",
+            "- **CA lineage** collapses exact issuer names into vendor-level families. In this report, legacy COMODO and Sectigo are treated as one lineage: `Sectigo/COMODO`.",
+            "- A **renewal asset lineage** means the same Subject CN, same Subject DN, same SAN profile, and same CA lineage reissued over time.",
+            "- Overlap threshold used here: anything `<50 days` is treated as normal rollover; anything `>=50 days` is treated as a red flag.",
+            "- A **past-only** red flag means the issue is visible historically, but no currently valid certificate still carries that same red-flag condition.",
+            "- A **current** red flag means at least one currently valid certificate still participates in that same red-flag condition.",
+        ]
+    )
+    lines.append("")
+    lines.append("## Chapter 1: Renewal Baseline Versus Overlap Red Flags")
+    lines.append("")
+    lines.append("**Management Summary**")
+    lines.append("")
+    lines.extend(
+        [
+            f"- {repeated_cn_count} of {len(cn_groups)} Subject CN values have more than one certificate across the historical corpus.",
+            f"- {assessment.repeated_asset_count} renewal asset lineages contain more than one certificate.",
+            f"- {assessment.normal_reissuance_assets} of those renewal asset lineages stay below the 50-day overlap threshold and fit the normal renewal model.",
+            f"- {len(assessment.overlap_current_rows)} renewal asset lineages still have a current overlap red flag.",
+            f"- {len(assessment.overlap_past_rows)} renewal asset lineages had an overlap red flag historically, but that issue is not current anymore.",
+            f"- {same_cn_same_dn} repeated Subject CN values keep the same Subject DN while rotating serial number, validity span, or SAN profile.",
+        ]
+    )
+    lines.append("")
+    lines.append("This is the baseline that matters before any anomaly analysis. Most service names are not single certificates frozen in time. They are lineages of certificates issued, renewed, and sometimes restructured under the same public hostname. The key distinction is whether successor and predecessor overlap only briefly, which is normal, or coexist for fifty days or longer, which is the threshold treated here as a red flag.")
+    lines.append("")
+    if assessment.overlap_current_rows:
+        lines.append("### Current Overlap Red Flags")
+        lines.append("")
+        lines.extend(
+            md_table(
+                ["Subject CN", "Lineage", "Asset Certs", "Current", "Max Concurrent", "Max Overlap Days", "Class", "Asset Details"],
+                [
+                    [
+                        row.subject_cn,
+                        row.lineage,
+                        str(row.asset_variant_count),
+                        str(row.current_certificate_count),
+                        str(row.max_concurrent),
+                        str(row.max_overlap_days),
+                        row.overlap_class,
+                        row.details,
+                    ]
+                    for row in assessment.overlap_current_rows[:20]
+                ],
+            )
+        )
+        lines.append("")
+    if assessment.overlap_past_rows:
+        lines.append("### Past Overlap Red Flags Now Fixed")
+        lines.append("")
+        lines.extend(
+            md_table(
+                ["Subject CN", "Lineage", "Asset Certs", "Current", "Max Concurrent", "Max Overlap Days", "Class", "Asset Details"],
+                [
+                    [
+                        row.subject_cn,
+                        row.lineage,
+                        str(row.asset_variant_count),
+                        str(row.current_certificate_count),
+                        str(row.max_concurrent),
+                        str(row.max_overlap_days),
+                        row.overlap_class,
+                        row.details,
+                    ]
+                    for row in assessment.overlap_past_rows[:20]
+                ],
+            )
+        )
+        lines.append("")
+    lines.append("## Chapter 2: Current Red Flags")
+    lines.append("")
+    lines.append("**Management Summary**")
+    lines.append("")
+    lines.extend(
+        [
+            f"- Current overlap red flags: {len(assessment.overlap_current_rows)} Subject-CN asset lineages.",
+            f"- Current Subject DN drift: {len(assessment.dn_current_rows)} Subject CN values.",
+            f"- Current CA lineage drift: {len(assessment.vendor_current_rows)} Subject CN values.",
+            f"- Current SAN drift: {len(assessment.san_current_rows)} Subject CN values.",
+            "- This chapter is the shortest route to the names that deserve present-tense manual review.",
+        ]
+    )
+    lines.append("")
+    if assessment.current_red_flag_rows:
+        lines.extend(
+            md_table(
+                ["Subject CN", "Score", "Certs", "Current", "Flags", "Issuer Mix"],
+                [
+                    [
+                        row.subject_cn,
+                        str(row.score),
+                        str(row.certificate_count),
+                        str(row.current_certificate_count),
+                        row.flags,
+                        row.notes,
+                    ]
+                    for row in assessment.current_red_flag_rows[:30]
+                ],
+            )
+        )
+        lines.append("")
+    else:
+        lines.append("No current red flags were found under the configured rules.")
+        lines.append("")
+    lines.append("## Chapter 3: Past Red Flags Now Fixed")
+    lines.append("")
+    lines.append("**Management Summary**")
+    lines.append("")
+    lines.extend(
+        [
+            f"- Past-only overlap red flags now fixed: {len(assessment.overlap_past_rows)} Subject-CN asset lineages.",
+            f"- Past-only Subject DN drift now fixed: {len(assessment.dn_past_rows)} Subject CN values.",
+            f"- Past-only CA lineage drift now fixed: {len(assessment.vendor_past_rows)} Subject CN values.",
+            f"- Past-only SAN drift now fixed: {len(assessment.san_past_rows)} Subject CN values.",
+            "- These are not present-tense problems, but they matter because they show how the estate used to behave.",
+        ]
+    )
+    lines.append("")
+    if assessment.past_red_flag_rows:
+        lines.extend(
+            md_table(
+                ["Subject CN", "Score", "Certs", "Current", "Flags", "Issuer Mix"],
+                [
+                    [
+                        row.subject_cn,
+                        str(row.score),
+                        str(row.certificate_count),
+                        str(row.current_certificate_count),
+                        row.flags,
+                        row.notes,
+                    ]
+                    for row in assessment.past_red_flag_rows[:30]
+                ],
+            )
+        )
+        lines.append("")
+    else:
+        lines.append("No historical red flags were found under the configured rules.")
+        lines.append("")
+    lines.append("## Chapter 4: Subject DN Drift")
+    lines.append("")
+    lines.append("**Management Summary**")
+    lines.append("")
+    lines.extend(
+        [
+            f"- Current Subject DN drift: {len(assessment.dn_current_rows)}.",
+            f"- Past-only Subject DN drift now fixed: {len(assessment.dn_past_rows)}.",
+            f"- Total Subject CN values with more than one Subject DN across history: {len(assessment.dn_rows)}.",
+            "- This is relevant because it means the hostname stayed the same while the full subject identity string changed.",
+            "- That does not automatically imply a security problem, but it is exactly the kind of drift that deserves review when you care about ownership, issuance policy, or certificate governance.",
+        ]
+    )
+    lines.append("")
+    if assessment.dn_current_rows:
+        lines.append("### Current Subject DN Drift")
+        lines.append("")
+        lines.extend(
+            md_table(
+                ["Subject CN", "Certs", "Current", "Distinct Subject DNs", "Issuer Families", "Subject DN Samples"],
+                [
+                    [
+                        row.subject_cn,
+                        str(row.certificate_count),
+                        str(row.current_certificate_count),
+                        str(row.distinct_value_count),
+                        row.issuer_families,
+                        row.details,
+                    ]
+                    for row in assessment.dn_current_rows[:20]
+                ],
+            )
+        )
+        lines.append("")
+    if assessment.dn_past_rows:
+        lines.append("### Past Subject DN Drift Now Fixed")
+        lines.append("")
+        lines.extend(
+            md_table(
+                ["Subject CN", "Certs", "Current", "Distinct Subject DNs", "Issuer Families", "Subject DN Samples"],
+                [
+                    [
+                        row.subject_cn,
+                        str(row.certificate_count),
+                        str(row.current_certificate_count),
+                        str(row.distinct_value_count),
+                        row.issuer_families,
+                        row.details,
+                    ]
+                    for row in assessment.dn_past_rows[:20]
+                ],
+            )
+        )
+        lines.append("")
+    if not assessment.dn_rows:
+        lines.append("No cases were found.")
+        lines.append("")
+    lines.append("## Chapter 5: CA Lineage Drift")
+    lines.append("")
+    lines.append("**Management Summary**")
+    lines.append("")
+    lines.extend(
+        [
+            f"- Exact issuer-name changes across history: {len(assessment.issuer_rows)} Subject CN values.",
+            f"- Current CA lineage drift: {len(assessment.vendor_current_rows)} Subject CN values.",
+            f"- Past-only CA lineage drift now fixed: {len(assessment.vendor_past_rows)} Subject CN values.",
+            "- Exact issuer changes inside one lineage can be operationally normal. The stronger red flag is a drift between different CA lineages, with COMODO and Sectigo deliberately collapsed into one lineage here.",
+        ]
+    )
+    lines.append("")
+    if assessment.vendor_current_rows:
+        lines.append("### Current CA Lineage Drift")
+        lines.append("")
+        lines.extend(
+            md_table(
+                ["Subject CN", "Certs", "Current", "Distinct Lineages", "Lineage Mix", "Lineages Seen"],
+                [
+                    [
+                        row.subject_cn,
+                        str(row.certificate_count),
+                        str(row.current_certificate_count),
+                        str(row.distinct_value_count),
+                        row.issuer_families,
+                        row.details,
+                    ]
+                    for row in assessment.vendor_current_rows[:20]
+                ],
+            )
+        )
+        lines.append("")
+    if assessment.vendor_past_rows:
+        lines.append("### Past CA Lineage Drift Now Fixed")
+        lines.append("")
+        lines.extend(
+            md_table(
+                ["Subject CN", "Certs", "Current", "Distinct Lineages", "Lineage Mix", "Lineages Seen"],
+                [
+                    [
+                        row.subject_cn,
+                        str(row.certificate_count),
+                        str(row.current_certificate_count),
+                        str(row.distinct_value_count),
+                        row.issuer_families,
+                        row.details,
+                    ]
+                    for row in assessment.vendor_past_rows[:20]
+                ],
+            )
+        )
+        lines.append("")
+    if assessment.issuer_rows:
+        lines.append("### Exact Issuer Changes Inside The Same Or Different Lineages")
+        lines.append("")
+        lines.extend(
+            md_table(
+                ["Subject CN", "Certs", "Current", "Distinct Issuers", "Lineage Mix", "Issuer Samples"],
+                [
+                    [
+                        row.subject_cn,
+                        str(row.certificate_count),
+                        str(row.current_certificate_count),
+                        str(row.distinct_value_count),
+                        row.issuer_families,
+                        row.details,
+                    ]
+                    for row in assessment.issuer_rows[:20]
+                ],
+            )
+        )
+        lines.append("")
+    lines.append("## Chapter 6: SAN Profile Drift")
+    lines.append("")
+    lines.append("**Management Summary**")
+    lines.append("")
+    lines.extend(
+        [
+            f"- Current SAN drift: {len(assessment.san_current_rows)} Subject CN values.",
+            f"- Past-only SAN drift now fixed: {len(assessment.san_past_rows)} Subject CN values.",
+            f"- Total Subject CN values with more than one distinct SAN profile across history: {len(assessment.san_rows)}.",
+            f"- Top SAN-delta pattern classes: {', '.join(f'{name} ({count})' for name, count in assessment.san_pattern_counts.most_common()) or 'none'}.",
+            "- This shows whether the service name stayed stable while the covered endpoint set expanded, contracted, or shifted shape.",
+        ]
+    )
+    lines.append("")
+    if assessment.san_current_rows:
+        lines.append("### Current SAN Drift")
+        lines.append("")
+        lines.extend(
+            md_table(
+                [
+                    "Subject CN",
+                    "Certs",
+                    "Current",
+                    "SAN Profiles",
+                    "Stable SANs",
+                    "Variable SANs",
+                    "Delta Pattern",
+                    "Representative Delta",
+                ],
+                [
+                    [
+                        row.subject_cn,
+                        str(row.certificate_count),
+                        str(row.current_certificate_count),
+                        str(row.distinct_san_profiles),
+                        str(row.stable_entries),
+                        str(row.variable_entries),
+                        row.delta_pattern,
+                        row.representative_delta,
+                    ]
+                    for row in assessment.san_current_rows[:20]
+                ],
+            )
+        )
+        lines.append("")
+    if assessment.san_past_rows:
+        lines.append("### Past SAN Drift Now Fixed")
+        lines.append("")
+        lines.extend(
+            md_table(
+                [
+                    "Subject CN",
+                    "Certs",
+                    "Current",
+                    "SAN Profiles",
+                    "Stable SANs",
+                    "Variable SANs",
+                    "Delta Pattern",
+                    "Representative Delta",
+                ],
+                [
+                    [
+                        row.subject_cn,
+                        str(row.certificate_count),
+                        str(row.current_certificate_count),
+                        str(row.distinct_san_profiles),
+                        str(row.stable_entries),
+                        str(row.variable_entries),
+                        row.delta_pattern,
+                        row.representative_delta,
+                    ]
+                    for row in assessment.san_past_rows[:20]
+                ],
+            )
+        )
+        lines.append("")
+    lines.append("## Chapter 7: Historic Issuance Bursts And Step Changes")
+    lines.append("")
+    lines.append("**Management Summary**")
+    lines.append("")
+    lines.extend(
+        [
+            "- This chapter includes expired certificates on purpose, because step changes are historical phenomena rather than current-only phenomena.",
+            "- Strong same-day or same-week issuance bursts usually signal planned renewal waves, platform migrations, or bulk onboarding of service families.",
+            f"- Top issuance start dates: {', '.join(f'{row.start_day} ({row.certificate_count})' for row in assessment.day_rows[:6])}.",
+        ]
+    )
+    lines.append("")
+    lines.append("### Top Start Dates")
+    lines.append("")
+    lines.extend(
+        md_table(
+            ["Start Day", "Certificates", "Top Subject CNs", "Top Issuer Families"],
+            [[row.start_day, str(row.certificate_count), row.top_subjects, row.top_issuers] for row in assessment.day_rows],
+        )
+    )
+    lines.append("")
+    lines.append("### Step Weeks")
+    lines.append("")
+    if assessment.week_rows:
+        lines.extend(
+            md_table(
+                ["Week Start", "Certificates", "Prior 8-Week Avg", "Top Subject CNs", "Top Issuer Families"],
+                [
+                    [
+                        row.week_start,
+                        str(row.certificate_count),
+                        row.prior_eight_week_avg,
+                        row.top_subjects,
+                        row.top_issuers,
+                    ]
+                    for row in assessment.week_rows[:20]
+                ],
+            )
+        )
+        lines.append("")
+    else:
+        lines.append("No step weeks met the configured threshold.")
+        lines.append("")
+    lines.append("## Chapter 8: Interpretation")
+    lines.append("")
+    lines.append("The main operational picture is not one of single certificates mapped one-to-one to service names. It is a layered certificate lineage model. The normal case is rollover inside a stable renewal asset lineage with less than fifty days of overlap. The red flags are the exceptions layered on top of that baseline: overlap that persists for fifty days or more, Subject DN drift, CA lineage drift, and SAN drift. The current-versus-past split matters because it distinguishes live governance concerns from issues that appear to have been corrected already.")
+    lines.append("")
+    args.markdown_output.write_text("\n".join(lines) + "\n", encoding="utf-8")
+
+

What this block is doing

Writes the standalone historical report in Markdown.

+

Flow arrows

The finished HistoricalAssessment bundle plus the parsed CLI arguments feed this block. → render_markdown → A standalone historical report in Markdown, written to `args.markdown_output` for the reader or the next publishing step.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+
+## render_latex
+
+
def render_latex(args: argparse.Namespace, assessment: HistoricalAssessment) -> None:
+    args.latex_output.parent.mkdir(parents=True, exist_ok=True)
+    certificates = assessment.certificates
+    current_count = sum(1 for item in certificates if item.current)
+    cn_groups = assessment.cn_groups
+    repeated_cn_count = sum(1 for values in cn_groups.values() if len(values) > 1)
+    same_cn_same_dn = sum(1 for values in cn_groups.values() if len(values) > 1 and len({item.subject_dn for item in values}) == 1)
+
+    lines: list[str] = [
+        r"\documentclass[11pt]{article}",
+        r"\usepackage[a4paper,margin=18mm]{geometry}",
+        r"\usepackage{fontspec}",
+        r"\usepackage[table]{xcolor}",
+        r"\usepackage{microtype}",
+        r"\usepackage{hyperref}",
+        r"\usepackage{xurl}",
+        r"\usepackage{array}",
+        r"\usepackage{booktabs}",
+        r"\usepackage{longtable}",
+        r"\usepackage{enumitem}",
+        r"\usepackage{fancyhdr}",
+        r"\usepackage{titlesec}",
+        r"\usepackage[most]{tcolorbox}",
+        r"\usepackage{pdflscape}",
+        r"\defaultfontfeatures{Ligatures=TeX,Scale=MatchLowercase}",
+        r"\setmainfont{Palatino}",
+        r"\setsansfont{Avenir Next}",
+        r"\setmonofont{Menlo}",
+        r"\definecolor{Ink}{HTML}{17202A}",
+        r"\definecolor{Line}{HTML}{D0D5DD}",
+        r"\definecolor{Panel}{HTML}{F8FAFC}",
+        r"\definecolor{Accent}{HTML}{0F766E}",
+        r"\hypersetup{colorlinks=true,linkcolor=Accent,urlcolor=Accent,pdfauthor={CertTransparencySearch},pdftitle={Historical Certificate Lineage Analysis}}",
+        r"\setlength{\parindent}{0pt}",
+        r"\setlength{\parskip}{6pt}",
+        r"\setlength{\emergencystretch}{4em}",
+        r"\setlength{\headheight}{16pt}",
+        r"\setlength{\tabcolsep}{4.2pt}",
+        r"\renewcommand{\arraystretch}{1.12}",
+        r"\raggedbottom",
+        r"\setcounter{tocdepth}{2}",
+        r"\pagestyle{fancy}",
+        r"\fancyhf{}",
+        r"\renewcommand{\headrulewidth}{0pt}",
+        r"\fancyfoot[C]{\sffamily\footnotesize \thepage}",
+        r"\titleformat{\section}{\sffamily\bfseries\LARGE\color{Ink}\raggedright}{\thesection}{0.8em}{}",
+        r"\titleformat{\subsection}{\sffamily\bfseries\Large\color{Ink}\raggedright}{\thesubsection}{0.8em}{}",
+        r"\tcbset{panel/.style={enhanced,breakable,boxrule=0.55pt,arc=3pt,left=9pt,right=9pt,top=8pt,bottom=8pt,colback=white,colframe=Line}}",
+        r"\newcommand{\SummaryBox}[1]{\begin{tcolorbox}[panel,colback=Panel]#1\end{tcolorbox}}",
+        r"\begin{document}",
+        r"\begin{titlepage}",
+        r"\vspace*{18mm}",
+        r"{\sffamily\bfseries\fontsize{24}{28}\selectfont Historical Certificate Lineage Analysis\par}",
+        r"\vspace{8pt}",
+        r"{\Large A historical study of Subject CN reuse, subject drift, issuer drift, SAN drift, and issuance bursts\par}",
+        r"\vspace{18pt}",
+        rf"\textbf{{Generated}}: {ct_scan.latex_escape(ct_scan.utc_iso(datetime.now(UTC)))}\par",
+        rf"\textbf{{Configured search terms file}}: {ct_scan.latex_escape(args.domains_file.name)}\par",
+        r"\vspace{12pt}",
+        r"\SummaryBox{"
+        + rf"\textbf{{Headline}}: {len(certificates)} historical leaf certificates, {current_count} currently valid, {len(cn_groups)} Subject CN values, {repeated_cn_count} multi-certificate CN lineages."
+        + r"}",
+        r"\end{titlepage}",
+        r"\tableofcontents",
+        r"\clearpage",
+    ]
+
+    def add_summary(items: list[str]) -> None:
+        lines.append(r"\SummaryBox{\textbf{Management Summary}\begin{itemize}[leftmargin=1.4em]")
+        for item in items:
+            lines.append(rf"\item {ct_scan.latex_escape(item)}")
+        lines.append(r"\end{itemize}}")
+
+    lines.append(r"\section{Executive Summary}")
+    add_summary(
+        [
+            f"Historical unique leaf certificates in scope: {len(certificates)}.",
+            f"Currently valid subset inside that historical corpus: {current_count}.",
+            f"Distinct Subject CN values: {len(cn_groups)}.",
+            f"Subject CN values with more than one certificate over time: {repeated_cn_count}.",
+            f"Normal renewal asset lineages with overlap below 50 days: {assessment.normal_reissuance_assets}.",
+            f"Current overlap red flags: {len(assessment.overlap_current_rows)}.",
+            f"Past-only overlap red flags now fixed: {len(assessment.overlap_past_rows)}.",
+        ]
+    )
+    lines.append(
+        r"This report treats Subject CN as a hostname label, not as a unique asset key. The goal is to observe how certificate lineages evolve over time across renewals, issuer changes, SAN changes, and issuance bursts, while separating normal rollover from genuine red flags."
+    )
+
+    lines.append(r"\section{Reading Notes}")
+    lines.append(r"\begin{itemize}[leftmargin=1.4em]")
+    for item in [
+        "Subject CN is the hostname placed in the certificate's Common Name field.",
+        "Subject DN is the full subject identity string, not just the hostname.",
+        "SAN profile means the complete set of SAN entries carried by a certificate.",
+        "CA lineage collapses exact issuer names into vendor-level families. Legacy COMODO and Sectigo are treated as one lineage here: Sectigo/COMODO.",
+        "A renewal asset lineage means the same Subject CN, same Subject DN, same SAN profile, and same CA lineage reissued over time.",
+        "The overlap threshold used here is simple: less than 50 days is normal rollover, 50 days or more is a red flag.",
+        "A past-only red flag means it appears historically but no currently valid certificate still carries that same condition.",
+    ]:
+        lines.append(rf"\item {ct_scan.latex_escape(item)}")
+    lines.append(r"\end{itemize}")
+
+    lines.append(r"\section{Renewal Baseline Versus Overlap Red Flags}")
+    add_summary(
+        [
+            f"{repeated_cn_count} of {len(cn_groups)} Subject CN values have more than one certificate across the historical corpus.",
+            f"{assessment.repeated_asset_count} renewal asset lineages contain more than one certificate.",
+            f"{assessment.normal_reissuance_assets} of those renewal asset lineages stay below the 50-day overlap threshold and fit the normal renewal model.",
+            f"{len(assessment.overlap_current_rows)} still have a current overlap red flag.",
+            f"{len(assessment.overlap_past_rows)} had an overlap red flag historically, but that issue is not current anymore.",
+            f"{same_cn_same_dn} repeated Subject CN values keep the same Subject DN while rotating serial number, validity span, or SAN profile.",
+        ]
+    )
+    lines.append(
+        r"The baseline is ordinary certificate rollover: successor and predecessor overlap briefly while deployment is switched over. The red flag is not reissuance itself, but overlap that persists for fifty days or longer for what otherwise looks like the same renewal asset lineage."
+    )
+    if assessment.overlap_current_rows:
+        lines.append(r"\subsection{Current Overlap Red Flags}")
+        lines.extend(
+            [
+                r"\begin{longtable}{>{\raggedright\arraybackslash}p{0.14\linewidth} >{\raggedright\arraybackslash}p{0.12\linewidth} >{\raggedleft\arraybackslash}p{0.07\linewidth} >{\raggedleft\arraybackslash}p{0.06\linewidth} >{\raggedleft\arraybackslash}p{0.08\linewidth} >{\raggedleft\arraybackslash}p{0.08\linewidth} >{\raggedright\arraybackslash}p{0.13\linewidth} >{\raggedright\arraybackslash}p{0.24\linewidth}}",
+                r"\toprule",
+                r"Subject CN & Lineage & Asset Certs & Current & Max Concurrent & Max Overlap Days & Class & Asset Details \\",
+                r"\midrule",
+            ]
+        )
+        for row in assessment.overlap_current_rows[:20]:
+            lines.append(
+                rf"{ct_scan.latex_escape(row.subject_cn)} & {ct_scan.latex_escape(row.lineage)} & {row.asset_variant_count} & {row.current_certificate_count} & {row.max_concurrent} & {row.max_overlap_days} & {ct_scan.latex_escape(row.overlap_class)} & {ct_scan.latex_escape(row.details)} \\"
+            )
+        lines.extend([r"\bottomrule", r"\end{longtable}"])
+    if assessment.overlap_past_rows:
+        lines.append(r"\subsection{Past Overlap Red Flags Now Fixed}")
+        lines.extend(
+            [
+                r"\begin{longtable}{>{\raggedright\arraybackslash}p{0.14\linewidth} >{\raggedright\arraybackslash}p{0.12\linewidth} >{\raggedleft\arraybackslash}p{0.07\linewidth} >{\raggedleft\arraybackslash}p{0.06\linewidth} >{\raggedleft\arraybackslash}p{0.08\linewidth} >{\raggedleft\arraybackslash}p{0.08\linewidth} >{\raggedright\arraybackslash}p{0.13\linewidth} >{\raggedright\arraybackslash}p{0.24\linewidth}}",
+                r"\toprule",
+                r"Subject CN & Lineage & Asset Certs & Current & Max Concurrent & Max Overlap Days & Class & Asset Details \\",
+                r"\midrule",
+            ]
+        )
+        for row in assessment.overlap_past_rows[:20]:
+            lines.append(
+                rf"{ct_scan.latex_escape(row.subject_cn)} & {ct_scan.latex_escape(row.lineage)} & {row.asset_variant_count} & {row.current_certificate_count} & {row.max_concurrent} & {row.max_overlap_days} & {ct_scan.latex_escape(row.overlap_class)} & {ct_scan.latex_escape(row.details)} \\"
+            )
+        lines.extend([r"\bottomrule", r"\end{longtable}"])
+
+    lines.append(r"\section{Current Red Flags}")
+    add_summary(
+        [
+            f"Current overlap red flags: {len(assessment.overlap_current_rows)} Subject-CN asset lineages.",
+            f"Current Subject DN drift: {len(assessment.dn_current_rows)} Subject CN values.",
+            f"Current CA lineage drift: {len(assessment.vendor_current_rows)} Subject CN values.",
+            f"Current SAN drift: {len(assessment.san_current_rows)} Subject CN values.",
+        ]
+    )
+    if assessment.current_red_flag_rows:
+        lines.extend(
+            [
+                r"\begin{longtable}{>{\raggedright\arraybackslash}p{0.18\linewidth} >{\raggedleft\arraybackslash}p{0.06\linewidth} >{\raggedleft\arraybackslash}p{0.06\linewidth} >{\raggedleft\arraybackslash}p{0.06\linewidth} >{\raggedright\arraybackslash}p{0.30\linewidth} >{\raggedright\arraybackslash}p{0.26\linewidth}}",
+                r"\toprule",
+                r"Subject CN & Score & Certs & Current & Flags & Issuer Mix \\",
+                r"\midrule",
+            ]
+        )
+        for row in assessment.current_red_flag_rows[:30]:
+            lines.append(
+                rf"{ct_scan.latex_escape(row.subject_cn)} & {row.score} & {row.certificate_count} & {row.current_certificate_count} & {ct_scan.latex_escape(row.flags)} & {ct_scan.latex_escape(row.notes)} \\"
+            )
+        lines.extend([r"\bottomrule", r"\end{longtable}"])
+    else:
+        lines.append(r"No current red flags were found under the configured rules.")
+
+    lines.append(r"\section{Past Red Flags Now Fixed}")
+    add_summary(
+        [
+            f"Past-only overlap red flags now fixed: {len(assessment.overlap_past_rows)} Subject-CN asset lineages.",
+            f"Past-only Subject DN drift now fixed: {len(assessment.dn_past_rows)} Subject CN values.",
+            f"Past-only CA lineage drift now fixed: {len(assessment.vendor_past_rows)} Subject CN values.",
+            f"Past-only SAN drift now fixed: {len(assessment.san_past_rows)} Subject CN values.",
+        ]
+    )
+    if assessment.past_red_flag_rows:
+        lines.extend(
+            [
+                r"\begin{longtable}{>{\raggedright\arraybackslash}p{0.18\linewidth} >{\raggedleft\arraybackslash}p{0.06\linewidth} >{\raggedleft\arraybackslash}p{0.06\linewidth} >{\raggedleft\arraybackslash}p{0.06\linewidth} >{\raggedright\arraybackslash}p{0.30\linewidth} >{\raggedright\arraybackslash}p{0.26\linewidth}}",
+                r"\toprule",
+                r"Subject CN & Score & Certs & Current & Flags & Issuer Mix \\",
+                r"\midrule",
+            ]
+        )
+        for row in assessment.past_red_flag_rows[:30]:
+            lines.append(
+                rf"{ct_scan.latex_escape(row.subject_cn)} & {row.score} & {row.certificate_count} & {row.current_certificate_count} & {ct_scan.latex_escape(row.flags)} & {ct_scan.latex_escape(row.notes)} \\"
+            )
+        lines.extend([r"\bottomrule", r"\end{longtable}"])
+    else:
+        lines.append(r"No historical red flags were found under the configured rules.")
+
+    lines.append(r"\section{Subject DN Drift}")
+    add_summary(
+        [
+            f"Current Subject DN drift: {len(assessment.dn_current_rows)}.",
+            f"Past-only Subject DN drift now fixed: {len(assessment.dn_past_rows)}.",
+            f"Total Subject CN values with more than one Subject DN across history: {len(assessment.dn_rows)}.",
+            "This matters because the hostname stayed the same while the full subject identity string changed.",
+            "That is not automatically a security problem, but it is relevant governance drift.",
+        ]
+    )
+    if assessment.dn_current_rows:
+        lines.append(r"\subsection{Current Subject DN Drift}")
+        lines.extend(
+            [
+                r"\begin{longtable}{>{\raggedright\arraybackslash}p{0.20\linewidth} >{\raggedleft\arraybackslash}p{0.07\linewidth} >{\raggedleft\arraybackslash}p{0.07\linewidth} >{\raggedleft\arraybackslash}p{0.09\linewidth} >{\raggedright\arraybackslash}p{0.20\linewidth} >{\raggedright\arraybackslash}p{0.29\linewidth}}",
+                r"\toprule",
+                r"Subject CN & Certs & Current & Distinct Subject DNs & Issuer Families & Subject DN Samples \\",
+                r"\midrule",
+            ]
+        )
+        for row in assessment.dn_current_rows[:20]:
+            lines.append(
+                rf"{ct_scan.latex_escape(row.subject_cn)} & {row.certificate_count} & {row.current_certificate_count} & {row.distinct_value_count} & {ct_scan.latex_escape(row.issuer_families)} & {ct_scan.latex_escape(row.details)} \\"
+            )
+        lines.extend([r"\bottomrule", r"\end{longtable}"])
+    if assessment.dn_past_rows:
+        lines.append(r"\subsection{Past Subject DN Drift Now Fixed}")
+        lines.extend(
+            [
+                r"\begin{longtable}{>{\raggedright\arraybackslash}p{0.20\linewidth} >{\raggedleft\arraybackslash}p{0.07\linewidth} >{\raggedleft\arraybackslash}p{0.07\linewidth} >{\raggedleft\arraybackslash}p{0.09\linewidth} >{\raggedright\arraybackslash}p{0.20\linewidth} >{\raggedright\arraybackslash}p{0.29\linewidth}}",
+                r"\toprule",
+                r"Subject CN & Certs & Current & Distinct Subject DNs & Issuer Families & Subject DN Samples \\",
+                r"\midrule",
+            ]
+        )
+        for row in assessment.dn_past_rows[:20]:
+            lines.append(
+                rf"{ct_scan.latex_escape(row.subject_cn)} & {row.certificate_count} & {row.current_certificate_count} & {row.distinct_value_count} & {ct_scan.latex_escape(row.issuer_families)} & {ct_scan.latex_escape(row.details)} \\"
+            )
+        lines.extend([r"\bottomrule", r"\end{longtable}"])
+    if not assessment.dn_rows:
+        lines.append(r"No cases were found.")
+
+    lines.append(r"\section{CA Lineage Drift}")
+    add_summary(
+        [
+            f"Exact issuer-name changes across history: {len(assessment.issuer_rows)} Subject CN values.",
+            f"Current CA lineage drift: {len(assessment.vendor_current_rows)} Subject CN values.",
+            f"Past-only CA lineage drift now fixed: {len(assessment.vendor_past_rows)} Subject CN values.",
+            "Exact issuer changes inside one lineage can be operationally normal. CA lineage drift is the stronger signal, with COMODO and Sectigo deliberately collapsed into one lineage.",
+        ]
+    )
+    if assessment.vendor_current_rows:
+        lines.append(r"\subsection{Current CA Lineage Drift}")
+        lines.extend(
+            [
+                r"\begin{longtable}{>{\raggedright\arraybackslash}p{0.20\linewidth} >{\raggedleft\arraybackslash}p{0.07\linewidth} >{\raggedleft\arraybackslash}p{0.07\linewidth} >{\raggedleft\arraybackslash}p{0.08\linewidth} >{\raggedright\arraybackslash}p{0.18\linewidth} >{\raggedright\arraybackslash}p{0.32\linewidth}}",
+                r"\toprule",
+                r"Subject CN & Certs & Current & Distinct Lineages & Lineage Mix & Lineages Seen \\",
+                r"\midrule",
+            ]
+        )
+        for row in assessment.vendor_current_rows[:20]:
+            lines.append(
+                rf"{ct_scan.latex_escape(row.subject_cn)} & {row.certificate_count} & {row.current_certificate_count} & {row.distinct_value_count} & {ct_scan.latex_escape(row.issuer_families)} & {ct_scan.latex_escape(row.details)} \\"
+            )
+        lines.extend([r"\bottomrule", r"\end{longtable}"])
+    if assessment.vendor_past_rows:
+        lines.append(r"\subsection{Past CA Lineage Drift Now Fixed}")
+        lines.extend(
+            [
+                r"\begin{longtable}{>{\raggedright\arraybackslash}p{0.20\linewidth} >{\raggedleft\arraybackslash}p{0.07\linewidth} >{\raggedleft\arraybackslash}p{0.07\linewidth} >{\raggedleft\arraybackslash}p{0.08\linewidth} >{\raggedright\arraybackslash}p{0.18\linewidth} >{\raggedright\arraybackslash}p{0.32\linewidth}}",
+                r"\toprule",
+                r"Subject CN & Certs & Current & Distinct Lineages & Lineage Mix & Lineages Seen \\",
+                r"\midrule",
+            ]
+        )
+        for row in assessment.vendor_past_rows[:20]:
+            lines.append(
+                rf"{ct_scan.latex_escape(row.subject_cn)} & {row.certificate_count} & {row.current_certificate_count} & {row.distinct_value_count} & {ct_scan.latex_escape(row.issuer_families)} & {ct_scan.latex_escape(row.details)} \\"
+            )
+        lines.extend([r"\bottomrule", r"\end{longtable}"])
+    if assessment.issuer_rows:
+        lines.append(r"\subsection{Exact Issuer Changes Inside The Same Or Different Lineages}")
+        lines.extend(
+            [
+                r"\begin{longtable}{>{\raggedright\arraybackslash}p{0.20\linewidth} >{\raggedleft\arraybackslash}p{0.07\linewidth} >{\raggedleft\arraybackslash}p{0.07\linewidth} >{\raggedleft\arraybackslash}p{0.08\linewidth} >{\raggedright\arraybackslash}p{0.18\linewidth} >{\raggedright\arraybackslash}p{0.32\linewidth}}",
+                r"\toprule",
+                r"Subject CN & Certs & Current & Distinct Issuers & Lineage Mix & Issuer Samples \\",
+                r"\midrule",
+            ]
+        )
+        for row in assessment.issuer_rows[:20]:
+            lines.append(
+                rf"{ct_scan.latex_escape(row.subject_cn)} & {row.certificate_count} & {row.current_certificate_count} & {row.distinct_value_count} & {ct_scan.latex_escape(row.issuer_families)} & {ct_scan.latex_escape(row.details)} \\"
+            )
+        lines.extend([r"\bottomrule", r"\end{longtable}"])
+
+    lines.append(r"\section{SAN Profile Drift}")
+    add_summary(
+        [
+            f"Current SAN drift: {len(assessment.san_current_rows)} Subject CN values.",
+            f"Past-only SAN drift now fixed: {len(assessment.san_past_rows)} Subject CN values.",
+            f"Total Subject CN values with more than one SAN profile across history: {len(assessment.san_rows)}.",
+            f"Top SAN-delta pattern classes: {', '.join(f'{name} ({count})' for name, count in assessment.san_pattern_counts.most_common()) or 'none'}.",
+            "This reveals whether the endpoint surface under the same hostname stayed stable or changed shape over time.",
+        ]
+    )
+    if assessment.san_current_rows:
+        lines.append(r"\subsection{Current SAN Drift}")
+        lines.extend(
+            [
+                r"\begin{longtable}{>{\raggedright\arraybackslash}p{0.16\linewidth} >{\raggedleft\arraybackslash}p{0.06\linewidth} >{\raggedleft\arraybackslash}p{0.06\linewidth} >{\raggedleft\arraybackslash}p{0.07\linewidth} >{\raggedleft\arraybackslash}p{0.07\linewidth} >{\raggedleft\arraybackslash}p{0.07\linewidth} >{\raggedright\arraybackslash}p{0.18\linewidth} >{\raggedright\arraybackslash}p{0.25\linewidth}}",
+                r"\toprule",
+                r"Subject CN & Certs & Current & Profiles & Stable & Variable & Delta Pattern & Representative Delta \\",
+                r"\midrule",
+            ]
+        )
+        for row in assessment.san_current_rows[:20]:
+            lines.append(
+                rf"{ct_scan.latex_escape(row.subject_cn)} & {row.certificate_count} & {row.current_certificate_count} & {row.distinct_san_profiles} & {row.stable_entries} & {row.variable_entries} & {ct_scan.latex_escape(row.delta_pattern)} & {ct_scan.latex_escape(row.representative_delta)} \\"
+            )
+        lines.extend([r"\bottomrule", r"\end{longtable}"])
+    if assessment.san_past_rows:
+        lines.append(r"\subsection{Past SAN Drift Now Fixed}")
+        lines.extend(
+            [
+                r"\begin{longtable}{>{\raggedright\arraybackslash}p{0.16\linewidth} >{\raggedleft\arraybackslash}p{0.06\linewidth} >{\raggedleft\arraybackslash}p{0.06\linewidth} >{\raggedleft\arraybackslash}p{0.07\linewidth} >{\raggedleft\arraybackslash}p{0.07\linewidth} >{\raggedleft\arraybackslash}p{0.07\linewidth} >{\raggedright\arraybackslash}p{0.18\linewidth} >{\raggedright\arraybackslash}p{0.25\linewidth}}",
+                r"\toprule",
+                r"Subject CN & Certs & Current & Profiles & Stable & Variable & Delta Pattern & Representative Delta \\",
+                r"\midrule",
+            ]
+        )
+        for row in assessment.san_past_rows[:20]:
+            lines.append(
+                rf"{ct_scan.latex_escape(row.subject_cn)} & {row.certificate_count} & {row.current_certificate_count} & {row.distinct_san_profiles} & {row.stable_entries} & {row.variable_entries} & {ct_scan.latex_escape(row.delta_pattern)} & {ct_scan.latex_escape(row.representative_delta)} \\"
+            )
+        lines.extend([r"\bottomrule", r"\end{longtable}"])
+
+    lines.append(r"\section{Historic Issuance Bursts And Step Changes}")
+    add_summary(
+        [
+            "This chapter includes expired certificates on purpose, because issuance bursts are historical phenomena rather than current-only phenomena.",
+            f"Top issuance start dates are {', '.join(f'{row.start_day} ({row.certificate_count})' for row in assessment.day_rows[:6])}.",
+            "Strong same-day or same-week bursts usually indicate planned renewal waves, platform migrations, or bulk onboarding of service families.",
+        ]
+    )
+    lines.append(r"\subsection{Top Start Dates}")
+    lines.extend(
+        [
+            r"\begin{longtable}{>{\raggedright\arraybackslash}p{0.13\linewidth} >{\raggedleft\arraybackslash}p{0.09\linewidth} >{\raggedright\arraybackslash}p{0.43\linewidth} >{\raggedright\arraybackslash}p{0.27\linewidth}}",
+            r"\toprule",
+            r"Start Day & Certificates & Top Subject CNs & Top Issuer Families \\",
+            r"\midrule",
+        ]
+    )
+    for row in assessment.day_rows:
+        lines.append(
+            rf"{ct_scan.latex_escape(row.start_day)} & {row.certificate_count} & {ct_scan.latex_escape(row.top_subjects)} & {ct_scan.latex_escape(row.top_issuers)} \\"
+        )
+    lines.extend([r"\bottomrule", r"\end{longtable}"])
+    lines.append(r"\subsection{Step Weeks}")
+    if assessment.week_rows:
+        lines.extend(
+            [
+                r"\begin{longtable}{>{\raggedright\arraybackslash}p{0.13\linewidth} >{\raggedleft\arraybackslash}p{0.08\linewidth} >{\raggedleft\arraybackslash}p{0.10\linewidth} >{\raggedright\arraybackslash}p{0.35\linewidth} >{\raggedright\arraybackslash}p{0.24\linewidth}}",
+                r"\toprule",
+                r"Week Start & Certs & Prior 8-Week Avg & Top Subject CNs & Top Issuer Families \\",
+                r"\midrule",
+            ]
+        )
+        for row in assessment.week_rows[:20]:
+            lines.append(
+                rf"{ct_scan.latex_escape(row.week_start)} & {row.certificate_count} & {ct_scan.latex_escape(row.prior_eight_week_avg)} & {ct_scan.latex_escape(row.top_subjects)} & {ct_scan.latex_escape(row.top_issuers)} \\"
+            )
+        lines.extend([r"\bottomrule", r"\end{longtable}"])
+    else:
+        lines.append(r"No step weeks met the configured threshold.")
+
+    lines.append(r"\section{Interpretation}")
+    lines.append(
+        r"The public certificate view is not just a static inventory. It is a change log. The normal case is rollover inside a stable renewal asset lineage with less than fifty days of overlap. The red flags are the exceptions layered on top of that baseline: overlap of fifty days or more, Subject DN drift, CA lineage drift, and SAN drift. The current-versus-past split matters because it separates live governance concerns from issues that appear to have been corrected already."
+    )
+    lines.extend([r"\end{document}"])
+    args.latex_output.write_text("\n".join(lines) + "\n", encoding="utf-8")
+
+

What this block is doing

Writes the standalone historical report in LaTeX.

+

Flow arrows

Earlier blocks or operator input feed this block. → render_latex → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## main + + + + + + +
+
def main() -> int:
+    args = parse_args()
+    assessment = build_assessment(args)
+    render_markdown(args, assessment)
+    render_latex(args, assessment)
+    if not args.skip_pdf:
+        ct_scan.compile_latex_to_pdf(args.latex_output, args.pdf_output, args.pdf_engine)
+    if not args.quiet:
+        print(
+            f"[report] historical_leaf={len(assessment.certificates)} markdown={args.markdown_output} latex={args.latex_output}"
+            + ("" if args.skip_pdf else f" pdf={args.pdf_output}"),
+            file=__import__("sys").stderr,
+        )
+    return 0
+
+

What this block is doing

The standalone command-line entrypoint for the historical analyzer.

+

Flow arrows

CLI arguments from the operator. → main → Runs the standalone historical analysis end to end.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ diff --git a/teachingNoobs/ct_master_report.md b/teachingNoobs/ct_master_report.md new file mode 100644 index 0000000..b425f66 --- /dev/null +++ b/teachingNoobs/ct_master_report.md @@ -0,0 +1,1170 @@ +# ct_master_report.py + +Source file: [`ct_master_report.py`](../ct_master_report.py) + +Current-state synthesizer. This file combines certificate facts, DNS facts, purpose classification, grouping, and curated examples into one report bundle. + +Main flow in one line: `current CT facts + DNS facts + usage facts -> one current-state report bundle` + +How to read this page: + +- left side: the actual source code block +- right side: a plain-English explanation for a beginner +- read from top to bottom because later blocks depend on earlier ones + +## Module setup + + + + + + +
+
#!/usr/bin/env python3
+
+from __future__ import annotations
+
+import argparse
+from collections import Counter, defaultdict
+from dataclasses import dataclass
+from datetime import UTC, datetime
+from pathlib import Path
+
+import ct_dns_utils
+import ct_scan
+import ct_usage_assessment
+
+
+ENV_TOKENS = [
+    "api",
+    "auth",
+    "developer",
+    "webbanking",
+    "sandbox",
+    "dev",
+    "test",
+    "qa",
+    "uat",
+    "preprod",
+    "prod",
+    "stage",
+    "stg",
+    "release",
+    "replica",
+    "support",
+    "hotfix",
+    "monitoring",
+    "mail",
+    "statement",
+    "update",
+    "secure",
+]
+
+

What this block is doing

Current-state report assembly code that sits above the low-level scanners.

+

Flow arrows

Earlier blocks or operator input feed this block. → Module setup → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## ExampleBlock + + + + + + +
+
@dataclass
+class ExampleBlock:
+    title: str
+    subject_cn: str
+    why_it_matters: str
+    evidence: list[str]
+
+

What this block is doing

A small narrative evidence block used in the naming chapter.

+

Flow arrows

Earlier blocks or operator input feed this block. → ExampleBlock → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## parse_args + + + + + + +
+
def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(
+        description="Generate a single consolidated CT, DNS, and naming report."
+    )
+    parser.add_argument("--domains-file", type=Path, default=Path("domains.local.txt"))
+    parser.add_argument("--cache-dir", type=Path, default=Path(".cache/ct-search"))
+    parser.add_argument("--dns-cache-dir", type=Path, default=Path(".cache/dns-scan"))
+    parser.add_argument("--cache-ttl-seconds", type=int, default=0)
+    parser.add_argument("--dns-cache-ttl-seconds", type=int, default=86400)
+    parser.add_argument("--max-candidates-per-domain", type=int, default=10000)
+    parser.add_argument("--retries", type=int, default=3)
+    parser.add_argument("--markdown-output", type=Path, default=Path("output/consolidated-corpus-report.md"))
+    parser.add_argument("--latex-output", type=Path, default=Path("output/consolidated-corpus-report.tex"))
+    parser.add_argument("--pdf-output", type=Path, default=Path("output/consolidated-corpus-report.pdf"))
+    parser.add_argument("--skip-pdf", action="store_true")
+    parser.add_argument("--pdf-engine", default="xelatex")
+    parser.add_argument("--quiet", action="store_true")
+    return parser.parse_args()
+
+

What this block is doing

This block defines the command-line knobs for the file: input paths, cache settings, output paths, and other runtime switches.

+

Flow arrows

Earlier blocks or operator input feed this block. → parse_args → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## load_records + + + + + + +
+
def load_records(args: argparse.Namespace) -> tuple[list[str], list[ct_scan.DatabaseRecord], dict[str, int]]:
+    domains = ct_scan.load_domains(args.domains_file)
+    records: list[ct_scan.DatabaseRecord] = []
+    raw_match_counts: dict[str, int] = {}
+    for domain in domains:
+        raw_match_counts[domain] = ct_scan.query_raw_match_count(domain=domain, attempts=args.retries, verbose=not args.quiet)
+        cached = ct_scan.load_cached_records(
+            cache_dir=args.cache_dir,
+            domain=domain,
+            ttl_seconds=args.cache_ttl_seconds,
+            max_candidates=args.max_candidates_per_domain,
+        )
+        if cached is not None:
+            if not args.quiet:
+                print(f"[cache] domain={domain} records={len(cached)}", file=__import__("sys").stderr)
+            records.extend(cached)
+            continue
+        if not args.quiet:
+            print(f"[query] domain={domain}", file=__import__("sys").stderr)
+        queried = ct_scan.query_domain(
+            domain=domain,
+            max_candidates=args.max_candidates_per_domain,
+            attempts=args.retries,
+            verbose=not args.quiet,
+        )
+        ct_scan.store_cached_records(args.cache_dir, domain, args.max_candidates_per_domain, queried)
+        records.extend(queried)
+    return domains, records, raw_match_counts
+
+

What this block is doing

Loads current CT records for all configured search terms.

+

Flow arrows

Configured domains from the local file. → load_records → `summarize_for_report` uses the returned CT rows as its starting point.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## dns_names_from_hits + + + + + + +
+
def dns_names_from_hits(hits: list[ct_scan.CertificateHit]) -> list[str]:
+    names = sorted(
+        {
+            ct_dns_utils.normalize_name(entry[4:])
+            for hit in hits
+            for entry in hit.san_entries
+            if entry.startswith("DNS:")
+        }
+    )
+    return names
+
+

What this block is doing

Collects every SAN entry that starts with `DNS:` across the current certificate hits, strips that prefix, normalizes each name, de-duplicates via a set, and returns the unique hostnames as a sorted list.

+

Flow arrows

Earlier blocks or operator input feed this block. → dns_names_from_hits → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## enrich_dns + + + + + + +
+
def enrich_dns(names: list[str], args: argparse.Namespace) -> list[ct_dns_utils.DnsObservation]:
+    observations = [ct_dns_utils.scan_name_cached(name, args.dns_cache_dir, args.dns_cache_ttl_seconds) for name in names]
+    unique_ips = sorted({ip for observation in observations for ip in (*observation.a_records, *observation.aaaa_records)})
+    ptr_cache_dir = args.dns_cache_dir / "ptr"
+    ip_ptrs = {ip: ct_dns_utils.ptr_lookup(ip, ptr_cache_dir, args.dns_cache_ttl_seconds) for ip in unique_ips}
+    for observation in observations:
+        observation.ptr_records = sorted(
+            {
+                ptr
+                for ip in (*observation.a_records, *observation.aaaa_records)
+                for ptr in ip_ptrs.get(ip, [])
+            }
+        )
+        observation.provider_hints = ct_dns_utils.infer_provider_hints(observation)
+        observation.stack_signature = ct_dns_utils.infer_stack_signature(observation)
+    return observations
+
+

What this block is doing

Adds DNS observations and provider clues to the raw SAN-name list.

+

Flow arrows

The unique SAN DNS names from current hits. → enrich_dns → `summarize_for_report` uses the enriched observations for DNS chapters and examples.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## short_issuer_family + + + + + + +
+
def short_issuer_family(issuer_name: str) -> str:
+    lowered = issuer_name.lower()
+    if "amazon" in lowered:
+        return "Amazon"
+    if "sectigo" in lowered or "comodo" in lowered:
+        return "Sectigo/COMODO"
+    if "google trust services" in lowered or "cn=we1" in lowered:
+        return "Google Trust Services"
+    return "Other"
+
+

What this block is doing

Maps a raw issuer name to a coarse CA family label using case-insensitive substring matching: `Amazon`, `Sectigo/COMODO` (the two brands are deliberately merged into one family), or `Google Trust Services`; anything unrecognized falls back to `Other`.

+

Flow arrows

Earlier blocks or operator input feed this block. → short_issuer_family → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## revocation_counts + + + + + + +
+
def revocation_counts(hits: list[ct_scan.CertificateHit]) -> Counter[str]:
+    return Counter(hit.revocation_status for hit in hits)
+
+

What this block is doing

Tallies how many certificate hits carry each revocation status, returning a `Counter` keyed by the status string.

+

Flow arrows

Earlier blocks or operator input feed this block. → revocation_counts → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## is_www_pair + + + + + + +
+
def is_www_pair(hit: ct_scan.CertificateHit) -> bool:
+    dns_names = sorted(entry[4:] for entry in hit.san_entries if entry.startswith("DNS:"))
+    if len(dns_names) != 2:
+        return False
+    plain = [name for name in dns_names if not name.startswith("www.")]
+    return len(plain) == 1 and f"www.{plain[0]}" in dns_names
+
+

What this block is doing

Detects the classic two-name website certificate: it returns `True` only when the certificate's SAN list contains exactly two DNS names forming a `name` / `www.name` pair for the same hostname.

+

Flow arrows

Earlier blocks or operator input feed this block. → is_www_pair → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## env_token_count + + + + + + +
+
def env_token_count(name: str) -> int:
+    lowered = name.lower()
+    return sum(1 for token in ENV_TOKENS if token in lowered)
+
+

What this block is doing

Counts how many environment-style tokens from `ENV_TOKENS` (such as `dev`, `test`, `prod`, `api`) occur as substrings of the lowercased hostname, giving a rough signal of how operationally descriptive the name is.

+

Flow arrows

Earlier blocks or operator input feed this block. → env_token_count → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## dns_zone_count + + + + + + +
+
def dns_zone_count(hit: ct_scan.CertificateHit) -> int:
+    zones = {ct_scan.san_tail_split(entry[4:])[1] for entry in hit.san_entries if entry.startswith("DNS:")}
+    return len(zones)
+
+

What this block is doing

Counts how many distinct DNS zones a single certificate spans, by reducing each SAN hostname to its zone with `ct_scan.san_tail_split` and taking the size of the resulting set.

+

Flow arrows

Earlier blocks or operator input feed this block. → dns_zone_count → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## zone_root_label + + + + + + +
+
def zone_root_label(name: str) -> str:
+    zone = ct_scan.san_tail_split(name)[1]
+    return zone.split(".")[0].lower()
+
+

What this block is doing

Reduces a hostname to the first label of its zone, lowercased, so later code can group names by their zone root.

+

Flow arrows

Earlier blocks or operator input feed this block. → zone_root_label → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## group_member_hits + + + + + + +
+
def group_member_hits(groups: list[ct_scan.CertificateGroup], hits: list[ct_scan.CertificateHit]) -> dict[str, list[ct_scan.CertificateHit]]:
+    mapping: dict[str, list[ct_scan.CertificateHit]] = {}
+    for group in groups:
+        mapping[group.group_id] = [hits[index] for index in group.member_indices]
+    return mapping
+
+

What this block is doing

Resolves each already-formed certificate group's member indices back to the actual hit objects, returning a mapping from group ID to that group's list of certificate hits. It does not perform the clustering itself; the groups arrive precomputed.

+

Flow arrows

Earlier blocks or operator input feed this block. → group_member_hits → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## stack_counts_for_hits + + + + + + +
+
def stack_counts_for_hits(member_hits: list[ct_scan.CertificateHit], observation_by_name: dict[str, ct_dns_utils.DnsObservation]) -> Counter[str]:
+    counts: Counter[str] = Counter()
+    for hit in member_hits:
+        for entry in hit.san_entries:
+            if not entry.startswith("DNS:"):
+                continue
+            name = ct_dns_utils.normalize_name(entry[4:])
+            observation = observation_by_name.get(name)
+            if observation is not None:
+                counts[observation.stack_signature] += 1
+    return counts
+
+

What this block is doing

Tallies DNS stack signatures across all SAN hostnames of a group's member certificates, looking each normalized name up in the precomputed observation map and silently skipping names that have no observation.

+

Flow arrows

Earlier blocks or operator input feed this block. → stack_counts_for_hits → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## confirm_search_premise + + + + + + +
+
def confirm_search_premise(hits: list[ct_scan.CertificateHit], domains: list[str]) -> tuple[int, int]:
    """Check the two premises of the search against every hit.

    Returns a pair of exception counts:
    - hits whose SAN DNS names contain none of the configured search terms;
    - hits whose Subject CN does not literally appear in the SAN DNS list.
    """
    missing_matching_san = 0
    subject_not_in_san = 0
    for hit in hits:
        dns_names = [entry[4:].lower() for entry in hit.san_entries if entry.startswith("DNS:")]
        # Substring match of any configured term against any SAN DNS name.
        term_matched = any(domain in dns_name for dns_name in dns_names for domain in domains)
        if not term_matched:
            missing_matching_san += 1
        if hit.subject_cn.lower() not in dns_names:
            subject_not_in_san += 1
    return missing_matching_san, subject_not_in_san
+
+

What this block is doing

Verifies the search premise for each hit: it counts certificates whose SAN set contains none of the configured search terms, and certificates whose Subject CN does not literally appear in the SAN DNS list.

+

Flow arrows

Earlier blocks or operator input feed this block. → confirm_search_premise → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## provider_counts
+
def provider_counts(observations: list[ct_dns_utils.DnsObservation]) -> Counter[str]:
    """Tally hosting/provider hints across all DNS observations.

    The catch-all "Unclassified" label is excluded so the counter only
    reflects positively identified providers.
    """
    counts: Counter[str] = Counter()
    for observation in observations:
        counts.update(hint for hint in observation.provider_hints if hint != "Unclassified")
    return counts
+
+

What this block is doing

Tallies hosting/provider hints across all DNS observations, skipping the catch-all "Unclassified" label.

+

Flow arrows

Earlier blocks or operator input feed this block. → provider_counts → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## top_suffixes
+
def top_suffixes(hits: list[ct_scan.CertificateHit], limit: int = 8) -> list[tuple[str, int]]:
    """Return the most common parent DNS suffixes of the Subject CNs.

    The suffix is everything after the first label of the lowercased CN;
    a CN with no dot counts under its own full name.
    """
    counts: Counter[str] = Counter()
    for hit in hits:
        lowered = hit.subject_cn.lower()
        _head, separator, tail = lowered.partition(".")
        counts[tail if separator else lowered] += 1
    return counts.most_common(limit)
+
+

What this block is doing

Counts the parent DNS suffix of each Subject CN (everything after the first label) and returns the most common ones.

+

Flow arrows

Earlier blocks or operator input feed this block. → top_suffixes → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## top_env_tokens
+
def top_env_tokens(hits: list[ct_scan.CertificateHit], limit: int = 10) -> list[tuple[str, int]]:
    """Return the most frequent environment tokens found in Subject CNs.

    Each hit contributes one count per ``ENV_TOKENS`` entry that occurs as a
    substring of its lowercased Subject CN.
    """
    counts: Counter[str] = Counter()
    for hit in hits:
        lowered = hit.subject_cn.lower()
        counts.update(token for token in ENV_TOKENS if token in lowered)
    return counts.most_common(limit)
+
+

What this block is doing

Counts how many Subject CNs contain each known environment token (from `ENV_TOKENS`) and returns the most frequent.

+

Flow arrows

Earlier blocks or operator input feed this block. → top_env_tokens → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## pick_examples
+
def pick_examples(
    hits: list[ct_scan.CertificateHit],
    groups: list[ct_scan.CertificateGroup],
    observation_by_name: dict[str, ct_dns_utils.DnsObservation],
) -> list[ExampleBlock]:
    """Select up to four representative ExampleBlocks for the naming chapter.

    Each candidate category is included only when at least one matching hit
    exists; within a category the single "biggest" example is chosen by a
    deterministic ``max()`` tie-breaker. Categories, in output order:
    shared operational rail, environment matrix certificate,
    brand-platform splice, and cross-zone bridge.
    """
    examples: list[ExampleBlock] = []
    group_map = group_member_hits(groups, hits)

    # Example 1: the largest numbered-CN family, illustrated by its member
    # with the densest SAN set (ties broken by CN length).
    numbered_groups = [group for group in groups if group.group_type == "numbered_cn_pattern"]
    if numbered_groups:
        group = max(numbered_groups, key=lambda item: item.member_count)
        member_hits = group_map[group.group_id]
        stack_counts = stack_counts_for_hits(member_hits, observation_by_name)
        example_hit = max(member_hits, key=lambda item: (len(item.san_entries), len(item.subject_cn)))
        examples.append(
            ExampleBlock(
                title="Shared operational rail",
                subject_cn=example_hit.subject_cn,
                why_it_matters="A numbered CN family usually signals a reusable service rail rather than a one-off branded page. It tends to expose fleet-style naming, repeated validity cycles, and many sibling hostnames.",
                evidence=[
                    f"Group basis: {ct_scan.describe_group_basis(group).replace('`', '')}.",
                    f"Certificates in family: {group.member_count}.",
                    f"Distinct Subject CNs in family: {group.distinct_subject_cn_count}.",
                    f"Top observed DNS delivery stacks: {', '.join(f'{label} ({count})' for label, count in stack_counts.most_common(3)) or 'none'}.",
                ],
            )
        )

    # Example 2: a hit with a large SAN set (>= 12 entries) that also carries
    # at least one environment-style token in its Subject CN.
    matrix_hits = [hit for hit in hits if len(hit.san_entries) >= 12 and env_token_count(hit.subject_cn) >= 1]
    if matrix_hits:
        hit = max(matrix_hits, key=lambda item: (len(item.san_entries), dns_zone_count(item), item.subject_cn))
        zones = sorted({ct_scan.san_tail_split(entry[4:])[1] for entry in hit.san_entries if entry.startswith("DNS:")})
        examples.append(
            ExampleBlock(
                title="Environment matrix certificate",
                subject_cn=hit.subject_cn,
                why_it_matters="A large SAN set with environment-style labels usually means one certificate is covering a coordinated platform surface across test, release, support, or tenant slices.",
                evidence=[
                    f"SAN entries: {len(hit.san_entries)}.",
                    f"Distinct DNS zones in SAN set: {len(zones)}.",
                    f"Environment tokens visible in Subject CN: {env_token_count(hit.subject_cn)}.",
                    f"First DNS zones in SAN set: {', '.join(zones[:6])}.",
                ],
            )
        )

    # Example 3: a "splice" — a hostname whose leading label contains a zone
    # root token belonging to a *different* public zone than its own.
    # zone_tokens is the vocabulary of zone root labels seen anywhere in the
    # corpus (Subject CNs plus SAN DNS entries).
    zone_tokens = sorted(
        {
            zone_root_label(hit.subject_cn)
            for hit in hits
            if "." in hit.subject_cn
        }
        | {
            zone_root_label(entry[4:])
            for hit in hits
            for entry in hit.san_entries
            if entry.startswith("DNS:")
        }
    )
    splice_hits = []
    for hit in hits:
        if "." not in hit.subject_cn:
            continue
        leading_label = hit.subject_cn.split(".")[0].lower()
        public_zone = ct_scan.san_tail_split(hit.subject_cn)[1]
        public_zone_root = public_zone.split(".")[0].lower()
        # Tokens from other zones embedded in this host's leading label.
        foreign_tokens = [token for token in zone_tokens if token != public_zone_root and token in leading_label]
        if foreign_tokens:
            splice_hits.append((hit, public_zone, foreign_tokens))
    if splice_hits:
        hit, public_zone, foreign_tokens = max(
            splice_hits,
            key=lambda item: (dns_zone_count(item[0]), len(item[0].san_entries), item[0].subject_cn),
        )
        # Siblings sharing the same second label ("middle namespace") and the
        # same public zone make the splice pattern visible in the evidence.
        middle_segment = hit.subject_cn.split(".")[1] if hit.subject_cn.count(".") >= 2 else ""
        related = sorted(
            {
                other.subject_cn
                for other in hits
                if middle_segment and f".{middle_segment}." in other.subject_cn
                and other.subject_cn != hit.subject_cn
                and ct_scan.san_tail_split(other.subject_cn)[1] == public_zone
            }
        )
        examples.append(
            ExampleBlock(
                title="Brand-platform splice",
                subject_cn=hit.subject_cn,
                why_it_matters="When the left side of a hostname carries one business or platform label but the public zone belongs to another brand, that usually exposes migration residue or a shared platform being presented through a different public namespace.",
                evidence=[
                    f"Subject CN mixes leading-label namespace tokens {', '.join(foreign_tokens[:3])} with the public zone {public_zone}: {hit.subject_cn}.",
                    f"Distinct DNS zones in SAN set: {dns_zone_count(hit)}.",
                    f"Representative sibling names in the same middle namespace: {', '.join(related[:5]) or 'none'}.",
                    f"SAN entries: {len(hit.san_entries)}.",
                ],
            )
        )

    # Example 4: the hit whose SAN set spans the most distinct DNS zones.
    cross_zone_hits = [hit for hit in hits if dns_zone_count(hit) > 1]
    if cross_zone_hits:
        hit = max(cross_zone_hits, key=lambda item: (dns_zone_count(item), len(item.san_entries), item.subject_cn))
        zones = sorted({ct_scan.san_tail_split(entry[4:])[1] for entry in hit.san_entries if entry.startswith("DNS:")})
        examples.append(
            ExampleBlock(
                title="Cross-zone bridge",
                subject_cn=hit.subject_cn,
                why_it_matters="When one certificate spans several DNS zones, it often reveals a shared service or a migration bridge between branded fronts and underlying service domains.",
                evidence=[
                    f"Distinct DNS zones in SAN set: {len(zones)}.",
                    f"Representative zones: {', '.join(zones[:8])}.",
                    f"SAN entries: {len(hit.san_entries)}.",
                ],
            )
        )

    return examples
+
+

What this block is doing

Chooses a few representative examples that make the naming and DNS story understandable.

+

Flow arrows

Current hits, groups, and DNS observations. → pick_examples → `summarize_for_report` stores the chosen examples for the naming chapter.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## build_group_digest
+
def build_group_digest(
    groups: list[ct_scan.CertificateGroup],
    hits: list[ct_scan.CertificateHit],
    observation_by_name: dict[str, ct_dns_utils.DnsObservation],
    limit: int = 20,
) -> list[dict[str, str]]:
    """Build a compact per-family summary table for the first *limit* groups.

    Each row carries string-typed fields (group id, basis, type, counts, and
    the top three DNS delivery stacks) ready for direct report rendering.
    """
    group_map = group_member_hits(groups, hits)
    digest: list[dict[str, str]] = []
    for group in groups[:limit]:
        stack_counts = stack_counts_for_hits(group_map[group.group_id], observation_by_name)
        top_stacks = ", ".join(f"{label} ({count})" for label, count in stack_counts.most_common(3))
        digest.append(
            {
                "group_id": group.group_id,
                "basis": ct_scan.describe_group_basis(group).replace("`", ""),
                "type": group.group_type,
                "certificates": str(group.member_count),
                "subjects": str(group.distinct_subject_cn_count),
                "top_stacks": top_stacks or "none",
            }
        )
    return digest
+
+

What this block is doing

Builds a compact family catalogue used in reports.

+

Flow arrows

Current groups plus DNS observations. → build_group_digest → Report builders use the digest in appendices and summary tables.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## summarize_for_report
+
def summarize_for_report(args: argparse.Namespace) -> dict[str, object]:
    """Run the full current-state analysis and bundle everything into one dict.

    Orchestrates the pipeline end to end: loads CT records, builds verified
    hits and CN groups, queries issuer trust, classifies certificate purpose,
    performs live DNS enrichment of SAN names, and derives all counters and
    examples the report renderers need. The returned dictionary is the single
    input consumed by the Markdown/LaTeX builders.
    """
    # Stage 1: CT data -> verified leaf hits -> CN families.
    domains, records, raw_match_counts = load_records(args)
    hits, verification = ct_scan.build_hits(records)
    groups = ct_scan.build_groups(hits)
    issuer_trust = ct_scan.query_issuer_trust(hits)
    classifications = ct_usage_assessment.build_classifications(hits, records)
    purpose_summary = ct_usage_assessment.summarize(classifications, domains)
    # Stage 2: live DNS enrichment of every unique SAN name.
    unique_dns_names = dns_names_from_hits(hits)
    observations = enrich_dns(unique_dns_names, args)
    observation_by_name = {observation.original_name: observation for observation in observations}
    # Stage 3: derived counters and integrity checks.
    rev_counts = revocation_counts(hits)
    provider_hint_counts = provider_counts(observations)
    dns_class_counts = Counter(observation.classification for observation in observations)
    dns_stack_counts = Counter(observation.stack_signature for observation in observations)
    issuer_counts = Counter(ct_scan.primary_issuer_name(hit) for hit in hits)
    # elements() re-expands the counter so families are weighted per certificate.
    issuer_family_counts = Counter(short_issuer_family(name) for name in issuer_counts.elements())
    missing_matching_san, subject_not_in_san = confirm_search_premise(hits, domains)
    numbered_groups = [group for group in groups if group.group_type == "numbered_cn_pattern"]
    public_www_pair_count = sum(1 for hit in hits if is_www_pair(hit))
    multi_zone_hit_count = sum(1 for hit in hits if dns_zone_count(hit) > 1)
    # Stage 4: narrative material (examples, family digest).
    examples = pick_examples(hits, groups, observation_by_name)
    digest = build_group_digest(groups, hits, observation_by_name)
    trusted_major = sum(1 for info in issuer_trust.values() if info.major_webpki)
    current_day = datetime.now(UTC).date().isoformat()

    return {
        "generated_at_utc": ct_scan.utc_iso(datetime.now(UTC)),
        "current_day": current_day,
        "domains": domains,
        "raw_match_counts": raw_match_counts,
        "cap": args.max_candidates_per_domain,
        "hits": hits,
        "groups": groups,
        "verification": verification,
        "issuer_trust": issuer_trust,
        "purpose_summary": purpose_summary,
        "classifications": classifications,
        "unique_dns_names": unique_dns_names,
        "observations": observations,
        "observation_by_name": observation_by_name,
        "rev_counts": rev_counts,
        "provider_hint_counts": provider_hint_counts,
        "dns_class_counts": dns_class_counts,
        "dns_stack_counts": dns_stack_counts,
        "issuer_counts": issuer_counts,
        "issuer_family_counts": issuer_family_counts,
        "missing_matching_san": missing_matching_san,
        "subject_not_in_san": subject_not_in_san,
        "numbered_groups": numbered_groups,
        "public_www_pair_count": public_www_pair_count,
        "multi_zone_hit_count": multi_zone_hit_count,
        "examples": examples,
        "top_suffixes": top_suffixes(hits),
        "top_env_tokens": top_env_tokens(hits),
        "group_digest": digest,
        "trusted_major": trusted_major,
    }
+
+

What this block is doing

Creates the big current-state dictionary consumed by the monograph builder.

+

Flow arrows

Current CT rows, DNS observations, issuer trust, and usage facts. → summarize_for_report → `ct_monograph_report.main` consumes this as the main current-state input.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## md_bullets
+
def md_bullets(items: list[str]) -> list[str]:
    """Format each string as a Markdown bullet line ("- item")."""
    return ["- " + item for item in items]
+
+

What this block is doing

Formats a list of strings as Markdown bullet lines by prefixing each with `- `.

+

Flow arrows

Earlier blocks or operator input feed this block. → md_bullets → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## render_markdown
+
def render_markdown(path: Path, report: dict[str, object]) -> None:
    """Write the consolidated CT/certificate/DNS report as Markdown to *path*.

    Renders the summary dictionary produced by ``summarize_for_report`` into
    a six-chapter document (method, certificate corpus, naming, DNS delivery,
    cross-view synthesis, confidence/limits), creating parent directories as
    needed. The whole document is assembled as a list of lines and written in
    one pass at the end.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    # Frequently used slices of the report dictionary.
    hits = report["hits"]
    groups = report["groups"]
    rev_counts = report["rev_counts"]
    purpose_summary = report["purpose_summary"]
    lines: list[str] = []
    lines.append("# Consolidated CT, Certificate, and DNS Report")
    lines.append("")
    lines.append(f"Generated: {report['generated_at_utc']}")
    # NOTE(review): report['domains'] holds the list of search terms, not a
    # file path, although the label says "file" — confirm intended wording.
    lines.append(f"Configured search terms file: `{report['domains']}`")
    lines.append("")
    lines.append("## Executive Overview")
    lines.append("")
    lines.extend(
        md_bullets(
            [
                f"{len(hits)} current leaf certificates are in scope after local leaf-only verification.",
                f"{len(groups)} CN families reduce the raw certificate list into readable naming clusters.",
                f"{purpose_summary.category_counts.get('tls_server_only', 0)} certificates are strict server-auth and {purpose_summary.category_counts.get('tls_server_and_client', 0)} also allow client auth.",
                f"{len(report['unique_dns_names'])} unique DNS SAN names were scanned live; the estate collapses into a small number of recurring delivery stacks.",
                "The strongest overall reading is a layered operating model: branded public names on top, reusable service rails underneath, and cloud or vendor delivery platforms at the edge.",
            ]
        )
    )
    lines.append("")
    # Chapter 1: run integrity and reader orientation.
    lines.append("## Chapter 1: Method, Integrity, and How To Read This")
    lines.append("")
    lines.append("**Management Summary**")
    lines.append("")
    lines.extend(
        md_bullets(
            [
                f"The scan now fails fast if the candidate cap is lower than the live raw match count. Current raw counts: {', '.join(f'{domain}={count}' for domain, count in report['raw_match_counts'].items())}.",
                f"The live candidate cap used for this run was {report['cap']}, which is safely above the current raw counts.",
                f"Leaf-only verification kept {report['verification'].unique_leaf_certificates} certificates and filtered {report['verification'].non_leaf_filtered} CA-style certificates and {report['verification'].precertificate_poison_filtered} precertificate-poison objects.",
                f"Every certificate in scope still contains at least one DNS SAN containing one of the configured search terms; exceptions found: {report['missing_matching_san']}.",
            ]
        )
    )
    lines.append("")
    lines.append("Certificate Transparency is the public logging layer for issued certificates. The scan starts there, then reads the actual X.509 certificate bytes, verifies that each object is a real leaf certificate, extracts SAN and Subject CN values, checks revocation state from crt.sh data, and then scans the DNS names seen in SANs.")
    lines.append("")
    lines.append("A **Subject CN** is the traditional primary name in a certificate. A **SAN** list is the modern list of all names the certificate covers. A **leaf certificate** is the endpoint certificate presented by a service, as distinct from a CA certificate used to sign other certificates.")
    lines.append("")
    # Chapter 2: issuers, revocation, and EKU purpose split.
    lines.append("## Chapter 2: Certificate Corpus")
    lines.append("")
    lines.append("**Management Summary**")
    lines.append("")
    lines.extend(
        md_bullets(
            [
                f"The issuer landscape is concentrated: {', '.join(f'{name} ({count})' for name, count in report['issuer_family_counts'].most_common())}.",
                f"Revocation mix: {rev_counts.get('not_revoked', 0)} not revoked, {rev_counts.get('revoked', 0)} revoked, {rev_counts.get('unknown', 0)} unknown.",
                f"Purpose split: {purpose_summary.category_counts.get('tls_server_only', 0)} server-only, {purpose_summary.category_counts.get('tls_server_and_client', 0)} server+client, and zero client-only, S/MIME, or code-signing certificates.",
                f"All {len(hits)} Subject CN values appear literally in the SAN DNS set.",
            ]
        )
    )
    lines.append("")
    lines.append("An **issuer CA** is the certificate authority that signed the endpoint certificate. A **WebPKI-trusted** issuer is one that browsers and operating systems currently trust for public TLS. In this corpus, all visible issuers are live server-auth issuers in the public trust ecosystem.")
    lines.append("")
    lines.append("### Issuer Breakdown")
    lines.append("")
    for issuer_name, count in report["issuer_counts"].most_common():
        trust = report["issuer_trust"][issuer_name]
        lines.append(f"- `{issuer_name}`: {count} certificates | major WebPKI stores: {'yes' if trust.major_webpki else 'no'}")
    lines.append("")
    lines.append("### Purpose Assessment")
    lines.append("")
    for category, count in purpose_summary.category_counts.items():
        lines.append(f"- `{category}`: {count}")
    lines.append("")
    lines.append(
        "An **Extended Key Usage (EKU)** value tells software what the certificate is allowed to do. "
        f"Here the estate is entirely TLS-capable. The only nuance is that {purpose_summary.category_counts.get('tls_server_and_client', 0)} certificates also allow `clientAuth`. "
        "That does not by itself prove a separate client-certificate estate; in context, they still look like hostname certificates issued from a permissive or older server template."
    )
    lines.append("")
    # Chapter 3: naming patterns, tokens, and the dynamic examples.
    lines.append("## Chapter 3: Naming Architecture")
    lines.append("")
    lines.append("**Management Summary**")
    lines.append("")
    lines.extend(
        md_bullets(
            [
                f"{len(report['numbered_groups'])} numbered CN families point to reusable service rails rather than one-off pages.",
                f"{report['public_www_pair_count']} certificates use the clean public front-door pattern of a base name paired with `www`.",
                f"{report['multi_zone_hit_count']} certificates span more than one DNS zone in SAN, which is usually a sign of shared platforms, migrations, or multi-brand exposure.",
                f"Most common suffixes: {', '.join(f'{suffix} ({count})' for suffix, count in report['top_suffixes'])}.",
            ]
        )
    )
    lines.append("")
    lines.append("Hostnames often look arbitrary because they are doing several jobs at once. Some names are for customers, some are for engineers, some encode environment state, and some preserve older platform lineage because renaming working infrastructure is costly.")
    lines.append("")
    lines.append("### Frequent Naming Tokens")
    lines.append("")
    for token, count in report["top_env_tokens"]:
        lines.append(f"- `{token}`: {count}")
    lines.append("")
    lines.append("### Dynamic Examples")
    lines.append("")
    # Examples were selected by pick_examples; each becomes a sub-section.
    for example in report["examples"]:
        lines.append(f"#### {example.title}")
        lines.append("")
        lines.append(f"- Subject CN: `{example.subject_cn}`")
        lines.append(f"- Why it matters: {example.why_it_matters}")
        for point in example.evidence:
            lines.append(f"- Evidence: {point}")
        lines.append("")
    # Chapter 4: live DNS view of the SAN corpus.
    lines.append("## Chapter 4: DNS Delivery Architecture")
    lines.append("")
    lines.append("**Management Summary**")
    lines.append("")
    lines.extend(
        md_bullets(
            [
                f"{len(report['unique_dns_names'])} unique DNS names were scanned from the SAN corpus.",
                f"DNS classes: {', '.join(f'{label}={count}' for label, count in report['dns_class_counts'].most_common())}.",
                f"Top delivery signatures: {', '.join(f'{label} ({count})' for label, count in report['dns_stack_counts'].most_common(6))}.",
                "The DNS layer turns a large hostname set into a smaller number of delivery stacks: CDN edges, API gateways, load balancers, and specialist vendor platforms.",
            ]
        )
    )
    lines.append("")
    lines.append("A **CNAME** is a DNS alias, meaning one hostname points to another hostname. An **A** or **AAAA** record is the final address mapping. An **NXDOMAIN** response means the public DNS name does not exist at the moment of the scan. That does not automatically invalidate the certificate-side finding, because certificate and DNS lifecycles can move at different speeds.")
    lines.append("")
    lines.append("### Delivery Stack Counts")
    lines.append("")
    for label, count in report["dns_stack_counts"].most_common(12):
        lines.append(f"- `{label}`: {count}")
    lines.append("")
    lines.append("### Platform and Provider Explanations")
    lines.append("")
    # Glossary entries are emitted only when the term was actually observed,
    # except for the generic DNS vocabulary terms, which are always shown.
    glossary = ct_dns_utils.provider_explanations()
    seen_terms = set()
    for observation in report["observations"]:
        seen_terms.update(observation.provider_hints)
    for term in ["Adobe Campaign", "AWS", "AWS CloudFront", "AWS ALB", "Google Apigee", "Pega Cloud", "Microsoft Edge", "Infinite / agency alias", "CNAME", "A record", "AAAA record", "PTR record", "NXDOMAIN"]:
        if term in glossary and (term in seen_terms or term in {"CNAME", "A record", "AAAA record", "PTR record", "NXDOMAIN", "AWS ALB"}):
            lines.append(f"- **{term}**: {glossary[term]}")
    lines.append("")
    # Chapter 5: synthesis of certificate view and DNS view.
    lines.append("## Chapter 5: Where The Certificate View and DNS View Meet")
    lines.append("")
    lines.append("**Management Summary**")
    lines.append("")
    lines.extend(
        md_bullets(
            [
                "The certificate layer describes naming and trust; the DNS layer describes delivery and reachability. The same estate becomes legible only when both are read together.",
                "Numbered CN families usually behave like shared operational rails in certificates and collapse into repeatable delivery stacks in DNS.",
                "Cleaner public names tend to be the presentation layer, while denser SAN sets and multi-zone families tend to expose the platform layer underneath.",
            ]
        )
    )
    lines.append("")
    lines.append("The common ground is operational reality. A brand or product team wants a recognisable public name. A platform team wants a stable service rail. A delivery team wants environment labels and routable front doors. Certificates and DNS show those layers from different angles, which is why the estate looks messy when read from only one side.")
    lines.append("")
    lines.append("### Top Family Digest")
    lines.append("")
    for row in report["group_digest"]:
        lines.append(
            f"- `{row['group_id']}` | {row['basis']} | type={row['type']} | certs={row['certificates']} | subjects={row['subjects']} | stacks={row['top_stacks']}"
        )
    lines.append("")
    # Chapter 6: explicit confidence tiers for the report's claims.
    lines.append("## Chapter 6: Confidence, Limits, and Claims")
    lines.append("")
    lines.append("**Management Summary**")
    lines.append("")
    lines.extend(
        md_bullets(
            [
                "Strongest claims: issuer trust, leaf-only status, SAN and Subject CN structure, purpose EKU split, DNS stack signatures, and recurring family patterns.",
                "Medium-confidence claims: that the estate reflects a layered organisation with brand, platform, and delivery concerns superimposed on each other.",
                "Lower-confidence claims: exact meanings of internal abbreviations or exact organisation-chart boundaries inferred from naming alone.",
            ]
        )
    )
    lines.append("")
    lines.append("This report can prove what is visible in public certificate and DNS data. It cannot prove internal governance charts or the exact human meaning of every abbreviation. Where the report interprets rather than measures, it does so by tying the interpretation to repeated observable patterns.")
    lines.append("")
    path.write_text("\n".join(lines) + "\n", encoding="utf-8")
+
+

What this block is doing

Writes the shorter consolidated report in Markdown.

+

Flow arrows

Earlier blocks or operator input feed this block. → render_markdown → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## tex_escape
+
def tex_escape(value: str) -> str:
    """Escape LaTeX-special characters by delegating to the shared ct_scan helper."""
    escaped = ct_scan.latex_escape(value)
    return escaped
+
+

What this block is doing

Thin alias that delegates LaTeX escaping to `ct_scan.latex_escape`, giving the report builder a short local name for it.

+

Flow arrows

Earlier blocks or operator input feed this block. → tex_escape → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## render_latex
+
def render_latex(path: Path, report: dict[str, object]) -> None:
    """Write the consolidated current-state report as a LaTeX document at *path*.

    Builds the whole document line by line (preamble, title page, sections,
    longtables) and writes it out, creating parent directories as needed.
    *report* is the bundle of precomputed keys this function reads below
    (hits, groups, rev_counts, purpose_summary, issuer data, DNS data, ...).
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    hits = report["hits"]
    groups = report["groups"]
    rev_counts = report["rev_counts"]
    purpose_summary = report["purpose_summary"]

    # Preamble, colour palette, layout tuning, and the title page.
    # NOTE: \usepackage{fontspec} means this file must be compiled with
    # XeLaTeX or LuaLaTeX, not pdflatex.
    lines: list[str] = [
        r"\documentclass[11pt]{article}",
        r"\usepackage[a4paper,margin=18mm]{geometry}",
        r"\usepackage{fontspec}",
        r"\usepackage[table]{xcolor}",
        r"\usepackage{microtype}",
        r"\usepackage{hyperref}",
        r"\usepackage{xurl}",
        r"\usepackage{array}",
        r"\usepackage{booktabs}",
        r"\usepackage{tabularx}",
        r"\usepackage{longtable}",
        r"\usepackage{enumitem}",
        r"\usepackage{fancyhdr}",
        r"\usepackage{titlesec}",
        r"\usepackage[most]{tcolorbox}",
        r"\defaultfontfeatures{Ligatures=TeX,Scale=MatchLowercase}",
        r"\definecolor{Ink}{HTML}{17202A}",
        r"\definecolor{Muted}{HTML}{667085}",
        r"\definecolor{Line}{HTML}{D0D5DD}",
        r"\definecolor{Panel}{HTML}{F8FAFC}",
        r"\definecolor{Accent}{HTML}{0F766E}",
        r"\definecolor{AccentSoft}{HTML}{E6F4F1}",
        r"\hypersetup{colorlinks=true,linkcolor=Accent,urlcolor=Accent,pdfauthor={CertTransparencySearch},pdftitle={Consolidated CT, Certificate, and DNS Report}}",
        r"\setlength{\parindent}{0pt}",
        r"\setlength{\parskip}{6pt}",
        r"\setlength{\emergencystretch}{4em}",
        r"\setlength{\headheight}{16pt}",
        r"\setlength{\tabcolsep}{4.2pt}",
        r"\renewcommand{\arraystretch}{1.12}",
        r"\raggedbottom",
        r"\setcounter{tocdepth}{2}",
        r"\pagestyle{fancy}",
        r"\fancyhf{}",
        r"\renewcommand{\headrulewidth}{0pt}",
        r"\fancyfoot[C]{\sffamily\footnotesize \thepage}",
        r"\titleformat{\section}{\sffamily\bfseries\LARGE\color{Ink}\raggedright}{\thesection}{0.8em}{}",
        r"\titleformat{\subsection}{\sffamily\bfseries\Large\color{Ink}\raggedright}{\thesubsection}{0.8em}{}",
        r"\tcbset{panel/.style={enhanced,breakable,boxrule=0.55pt,arc=3pt,left=9pt,right=9pt,top=8pt,bottom=8pt,colback=white,colframe=Line}}",
        r"\newcommand{\SummaryBox}[1]{\begin{tcolorbox}[panel,colback=Panel]#1\end{tcolorbox}}",
        r"\begin{document}",
        r"\begin{titlepage}",
        r"\vspace*{18mm}",
        r"{\sffamily\bfseries\fontsize{24}{28}\selectfont Consolidated CT, Certificate, and DNS Report\par}",
        r"\vspace{6pt}",
        r"{\Large One document for the certificate corpus, naming system, DNS delivery view, and proof boundaries\par}",
        r"\vspace{18pt}",
        rf"\textbf{{Generated}}: {tex_escape(report['generated_at_utc'])}\par",
        rf"\textbf{{Configured search terms file}}: {tex_escape(str(report['domains']))}\par",
        r"\vspace{12pt}",
        r"\SummaryBox{"
        + rf"\textbf{{Headline}}: {len(hits)} leaf certificates, {len(groups)} CN families, {len(report['unique_dns_names'])} DNS names, "
        + rf"{purpose_summary.category_counts.get('tls_server_only', 0)} strict server-auth certificates, "
        + rf"{purpose_summary.category_counts.get('tls_server_and_client', 0)} dual-EKU certificates."
        + r"}",
        r"\end{titlepage}",
        r"\tableofcontents",
        r"\clearpage",
    ]

    # Local helper: emit a "Management Summary" tcolorbox bullet list.
    # Reused at the top of every section below.
    def add_summary(items: list[str]) -> None:
        lines.append(r"\SummaryBox{\textbf{Management Summary}\begin{itemize}[leftmargin=1.4em]")
        for item in items:
            lines.append(rf"\item {tex_escape(item)}")
        lines.append(r"\end{itemize}}")

    # Section 1: method and integrity notes.
    lines.append(r"\section{Method, Integrity, and How To Read This}")
    add_summary(
        [
            f"The scanner now refuses to run if the candidate cap is lower than the live raw match count; current counts are {', '.join(f'{domain}={count}' for domain, count in report['raw_match_counts'].items())}.",
            f"The live cap used for this run was {report['cap']}.",
            f"Leaf-only verification kept {report['verification'].unique_leaf_certificates} certificates.",
            f"Configured search-term coverage failures: {report['missing_matching_san']}.",
        ]
    )
    lines.append(
        r"Certificate Transparency is the public logging layer for issued certificates. The report starts there, validates the actual X.509 certificate bytes, and then scans the DNS names exposed in SANs. A Subject CN is the traditional primary name in a certificate; a SAN list is the modern set of all names the certificate covers."
    )

    # Section 2: the certificate corpus plus issuer and purpose breakdowns.
    lines.append(r"\section{Certificate Corpus}")
    add_summary(
        [
            f"{len(hits)} current leaf certificates are in scope.",
            f"Revocation mix: not revoked={rev_counts.get('not_revoked', 0)}, revoked={rev_counts.get('revoked', 0)}, unknown={rev_counts.get('unknown', 0)}.",
            f"Purpose split: server-only={purpose_summary.category_counts.get('tls_server_only', 0)}, server+client={purpose_summary.category_counts.get('tls_server_and_client', 0)}.",
            f"All Subject CN values appear in SAN DNS names.",
        ]
    )
    lines.extend(
        [
            r"\subsection{Issuer Breakdown}",
            r"\begin{longtable}{>{\raggedright\arraybackslash}p{0.67\linewidth} >{\raggedleft\arraybackslash}p{0.12\linewidth} >{\raggedleft\arraybackslash}p{0.12\linewidth}}",
            r"\toprule",
            r"Issuer & Count & WebPKI \\",
            r"\midrule",
        ]
    )
    for issuer_name, count in report["issuer_counts"].most_common():
        trust = report["issuer_trust"][issuer_name]
        lines.append(rf"{tex_escape(issuer_name)} & {count} & {'yes' if trust.major_webpki else 'no'} \\")
    lines.extend([r"\bottomrule", r"\end{longtable}"])
    lines.append(r"\subsection{Purpose Assessment}")
    lines.append(r"\begin{itemize}[leftmargin=1.4em]")
    for category, count in purpose_summary.category_counts.items():
        lines.append(rf"\item \texttt{{{tex_escape(category)}}}: {count}")
    lines.append(r"\end{itemize}")

    # Section 3: naming architecture and representative example panels.
    lines.append(r"\section{Naming Architecture}")
    add_summary(
        [
            f"{len(report['numbered_groups'])} numbered CN families indicate reusable service rails.",
            f"{report['public_www_pair_count']} certificates use a base-name plus www pairing.",
            f"{report['multi_zone_hit_count']} certificates span more than one DNS zone in SAN.",
            f"Most common suffixes are {', '.join(f'{suffix} ({count})' for suffix, count in report['top_suffixes'][:4])}.",
        ]
    )
    lines.append(r"\subsection{Representative Examples}")
    for example in report["examples"]:
        lines.append(r"\SummaryBox{")
        lines.append(rf"\textbf{{{tex_escape(example.title)}}}\par")
        lines.append(rf"\textbf{{Subject CN}}: \texttt{{{tex_escape(example.subject_cn)}}}\par")
        lines.append(tex_escape(example.why_it_matters) + r"\par")
        lines.append(r"\begin{itemize}[leftmargin=1.4em]")
        for point in example.evidence:
            lines.append(rf"\item {tex_escape(point)}")
        lines.append(r"\end{itemize}}")

    # Section 4: DNS delivery view (stack signature counts plus glossary).
    lines.append(r"\section{DNS Delivery Architecture}")
    add_summary(
        [
            f"{len(report['unique_dns_names'])} unique DNS names were scanned from SAN.",
            f"Top delivery signatures are {', '.join(f'{label} ({count})' for label, count in report['dns_stack_counts'].most_common(5))}.",
            "The DNS view reduces many hostnames into a smaller set of recurring delivery platforms.",
        ]
    )
    lines.extend(
        [
            r"\subsection{Delivery Stack Counts}",
            r"\begin{longtable}{>{\raggedright\arraybackslash}p{0.72\linewidth} >{\raggedleft\arraybackslash}p{0.16\linewidth}}",
            r"\toprule",
            r"Stack signature & Count \\",
            r"\midrule",
        ]
    )
    for label, count in report["dns_stack_counts"].most_common(12):
        lines.append(rf"{tex_escape(label)} & {count} \\")
    lines.extend([r"\bottomrule", r"\end{longtable}"])

    lines.append(r"\subsection{Platform Glossary}")
    lines.append(r"\begin{longtable}{>{\raggedright\arraybackslash}p{0.22\linewidth} >{\raggedright\arraybackslash}p{0.70\linewidth}}")
    lines.append(r"\toprule")
    lines.append(r"Term & Explanation \\")
    lines.append(r"\midrule")
    glossary = ct_dns_utils.provider_explanations()
    seen_terms = set()
    for observation in report["observations"]:
        seen_terms.update(observation.provider_hints)
    # Only glossary terms actually observed in this run are printed, except
    # for the generic DNS record-type terms, which are always shown.
    for term in ["Adobe Campaign", "AWS", "AWS CloudFront", "AWS ALB", "Google Apigee", "Pega Cloud", "Microsoft Edge", "Infinite / agency alias", "CNAME", "A record", "AAAA record", "PTR record", "NXDOMAIN"]:
        if term in glossary and (term in seen_terms or term in {"CNAME", "A record", "AAAA record", "PTR record", "NXDOMAIN", "AWS ALB"}):
            lines.append(rf"{tex_escape(term)} & {tex_escape(glossary[term])} \\")
    lines.extend([r"\bottomrule", r"\end{longtable}"])

    # Section 5: synthesis of the certificate view and the DNS view.
    lines.append(r"\section{Where The Certificate View and DNS View Meet}")
    add_summary(
        [
            "Certificates explain naming, trust, and purpose; DNS explains routing, reachability, and platform landing points.",
            "Numbered families usually behave like shared service rails, while clean two-name SAN pairs usually behave like public presentation fronts.",
            "The estate becomes coherent when brand, platform, and delivery are treated as different layers of the same system.",
        ]
    )
    lines.extend(
        [
            r"\subsection{Top Family Digest}",
            r"\begin{longtable}{>{\raggedright\arraybackslash}p{0.09\linewidth} >{\raggedright\arraybackslash}p{0.39\linewidth} >{\raggedright\arraybackslash}p{0.15\linewidth} >{\raggedleft\arraybackslash}p{0.09\linewidth} >{\raggedleft\arraybackslash}p{0.09\linewidth} >{\raggedright\arraybackslash}p{0.13\linewidth}}",
            r"\toprule",
            r"ID & Basis & Type & Certs & CNs & Top stacks \\",
            r"\midrule",
        ]
    )
    for row in report["group_digest"]:
        lines.append(
            rf"{tex_escape(row['group_id'])} & {tex_escape(row['basis'])} & {tex_escape(row['type'])} & {row['certificates']} & {row['subjects']} & {tex_escape(row['top_stacks'])} \\"
        )
    lines.extend([r"\bottomrule", r"\end{longtable}"])

    # Section 6: explicit confidence and proof-boundary statement.
    lines.append(r"\section{Confidence, Limits, and Claims}")
    add_summary(
        [
            "Strong claims in this report are the ones tied directly to certificate fields, DNS answers, and trust records.",
            "Interpretive claims are constrained to repeated patterns and are stated as readings, not as internal-org certainties.",
            "The exact meaning of internal abbreviations cannot be proven from CT and DNS alone.",
        ]
    )
    lines.append(
        r"The report can prove which issuers are used, which EKU patterns exist, which DNS stacks are visible, and which naming families repeat. It cannot prove the exact internal org chart or the exact human expansion of every short token."
    )
    lines.append(r"\end{document}")
    path.write_text("\n".join(lines) + "\n", encoding="utf-8")
+
+

What this block is doing

Writes the shorter consolidated report in LaTeX.

+

Flow arrows

Earlier blocks or operator input feed this block. → render_latex → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## main + + + + + + +
+
def main() -> int:
    """Command-line entrypoint for the consolidated current-state report.

    Parses arguments, builds the report bundle, renders Markdown and LaTeX,
    and optionally compiles the PDF. Returns 0 so the caller can pass the
    value to sys.exit().
    """
    # Local import instead of the original `__import__("sys")` hack; this
    # module does not import sys at the top level.
    import sys

    args = parse_args()
    report = summarize_for_report(args)
    render_markdown(args.markdown_output, report)
    render_latex(args.latex_output, report)
    if not args.skip_pdf:
        ct_scan.compile_latex_to_pdf(args.latex_output, args.pdf_output, args.pdf_engine)
    if not args.quiet:
        # Progress note goes to stderr so stdout stays clean for pipelines.
        print(
            f"[report] markdown={args.markdown_output} latex={args.latex_output}"
            + ("" if args.skip_pdf else f" pdf={args.pdf_output}"),
            file=sys.stderr,
        )
    return 0
+
+

What this block is doing

The standalone command-line entrypoint for the consolidated current-state report.

+

Flow arrows

CLI arguments from the operator. → main → Runs the shorter consolidated current-state report end to end.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ diff --git a/teachingNoobs/ct_monograph_report.md b/teachingNoobs/ct_monograph_report.md new file mode 100644 index 0000000..174d74c --- /dev/null +++ b/teachingNoobs/ct_monograph_report.md @@ -0,0 +1,3349 @@ +# ct_monograph_report.py + +Source file: [`ct_monograph_report.py`](../ct_monograph_report.py) + +Publication builder. This file takes all analytical layers and turns them into the final monograph in Markdown, LaTeX, and PDF. + +Main flow in one line: `current-state bundle + history + CAA + focused cohort -> Markdown/LaTeX/PDF monograph` + +How to read this page: + +- left side: the actual source code block +- right side: a plain-English explanation for a beginner +- read from top to bottom because later blocks depend on earlier ones + +## Module setup + + + + + + +
+
#!/usr/bin/env python3
+
+from __future__ import annotations
+
+import argparse
+from collections import Counter
+from pathlib import Path
+
+import ct_caa_analysis
+import ct_dns_utils
+import ct_focus_subjects
+import ct_lineage_report
+import ct_master_report
+import ct_scan
+
+

What this block is doing

The orchestration and publishing layer that turns all analytical modules into one publication.

+

Flow arrows

Earlier blocks or operator input feed this block. → Module setup → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## parse_args + + + + + + +
+
def parse_args() -> argparse.Namespace:
    """Define and parse the command-line options for the monograph build."""
    parser = argparse.ArgumentParser(
        description="Generate a complete monograph-style CT and DNS report with appendices."
    )

    # Tiny registrars keep every option declaration on one line below.
    def path_opt(flag: str, default: str) -> None:
        parser.add_argument(flag, type=Path, default=Path(default))

    def int_opt(flag: str, default: int) -> None:
        parser.add_argument(flag, type=int, default=default)

    # Inputs and caches.
    path_opt("--domains-file", "domains.local.txt")
    path_opt("--cache-dir", ".cache/ct-search")
    path_opt("--dns-cache-dir", ".cache/dns-scan")
    path_opt("--caa-cache-dir", ".cache/caa-scan")
    path_opt("--history-cache-dir", ".cache/ct-history-v2")
    path_opt("--focus-subjects-file", "focus_subjects.local.txt")
    int_opt("--cache-ttl-seconds", 0)
    int_opt("--dns-cache-ttl-seconds", 86400)
    int_opt("--caa-cache-ttl-seconds", 86400)
    int_opt("--max-candidates-per-domain", 10000)
    int_opt("--retries", 3)
    # Outputs (main monograph plus the hidden appendix files).
    path_opt("--markdown-output", "output/corpus/monograph.md")
    path_opt("--latex-output", "output/corpus/monograph.tex")
    path_opt("--pdf-output", "output/corpus/monograph.pdf")
    path_opt("--appendix-markdown-output", ".cache/monograph-temp/appendix-inventory.md")
    path_opt("--appendix-latex-output", ".cache/monograph-temp/appendix-inventory.tex")
    path_opt("--appendix-pdf-output", ".cache/monograph-temp/appendix-inventory.pdf")
    # Runtime switches.
    parser.add_argument("--skip-pdf", action="store_true")
    parser.add_argument("--pdf-engine", default="xelatex")
    parser.add_argument("--quiet", action="store_true")
    return parser.parse_args()
+
+

What this block is doing

This block defines the command-line knobs for the file: input paths, cache settings, output paths, and other runtime switches.

+

Flow arrows

Earlier blocks or operator input feed this block. → parse_args → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## build_scan_stats + + + + + + +
+
def build_scan_stats(report: dict[str, object]) -> ct_scan.ScanStats:
    """Assemble a ct_scan.ScanStats from the precomputed report bundle."""
    groups = report["groups"]
    # Count multi-member and singleton families explicitly; a family with
    # member_count > 1 is "multi", exactly 1 is "singleton".
    multi_member = sum(1 for item in groups if item.member_count > 1)
    singleton = sum(1 for item in groups if item.member_count == 1)
    type_tally: Counter[str] = Counter(item.group_type for item in groups)
    return ct_scan.ScanStats(
        generated_at_utc=report["generated_at_utc"],
        configured_domains=report["domains"],
        unique_leaf_certificates=len(report["hits"]),
        groups_total=len(groups),
        groups_multi_member=multi_member,
        groups_singleton=singleton,
        groups_by_type=dict(type_tally),
        verification=report["verification"],
    )
+
+

What this block is doing

This block constructs a richer higher-level result from simpler inputs.

+

Flow arrows

Earlier blocks or operator input feed this block. → build_scan_stats → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## render_appendix_inventory + + + + + + +
+
def render_appendix_inventory(args: argparse.Namespace, report: dict[str, object]) -> None:
    """Render the full-inventory appendix in Markdown, LaTeX, and optional PDF.

    Reuses ct_scan's report renderers so the appendix matches what the
    standalone scanner would produce; the resulting files are later embedded
    into the monograph.
    """
    stats = build_scan_stats(report)
    ct_scan.render_markdown_report(
        args.appendix_markdown_output,
        report["hits"],
        report["groups"],
        stats,
        report["issuer_trust"],
    )
    # Page numbers are suppressed because the monograph that embeds this
    # appendix applies its own pagination.
    ct_scan.render_latex_report(
        args.appendix_latex_output,
        report["hits"],
        report["groups"],
        stats,
        report["issuer_trust"],
        show_page_numbers=False,
    )
    if not args.skip_pdf:
        ct_scan.compile_latex_to_pdf(args.appendix_latex_output, args.appendix_pdf_output, args.pdf_engine)
+
+

What this block is doing

Generates the hidden full inventory appendix before the main monograph is assembled.

+

Flow arrows

The current-state report bundle. → render_appendix_inventory → Creates the hidden appendix files that are later embedded into the monograph.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## md_table + + + + + + +
+
def md_table(headers: list[str], rows: list[list[str]]) -> list[str]:
    """Render a pipe-delimited Markdown table as a list of lines."""
    header_line = "| " + " | ".join(headers) + " |"
    divider_line = "| " + " | ".join("---" for _ in headers) + " |"
    body_lines = ["| " + " | ".join(cells) + " |" for cells in rows]
    return [header_line, divider_line, *body_lines]
+
+

What this block is doing

This function is one of the building blocks inside `ct_monograph_report.py`. It exists so the file can do one narrow job at a time instead of one giant unreadable routine.

+

Flow arrows

Earlier blocks or operator input feed this block. → md_table → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## latex_escape + + + + + + +
+
def latex_escape(value: str) -> str:
    """Escape *value* for LaTeX output.

    Thin alias for ``ct_scan.latex_escape`` so this module has a single,
    locally-named escape entry point.
    """
    return ct_scan.latex_escape(value)
+
+

What this block is doing

This function is one of the building blocks inside `ct_monograph_report.py`. It exists so the file can do one narrow job at a time instead of one giant unreadable routine.

+

Flow arrows

Earlier blocks or operator input feed this block. → latex_escape → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## short_issuer + + + + + + +
+
def short_issuer(issuer_name: str) -> str:
    """Collapse a full issuer name into a short CA-family label.

    Unknown issuers fall through unchanged.
    """
    haystack = issuer_name.lower()
    # Ordered substring probes; first match wins (matches original order).
    family_probes = (
        (("amazon",), "Amazon"),
        (("sectigo", "comodo"), "Sectigo/COMODO"),
        (("google trust services", "cn=we1"), "Google Trust Services"),
    )
    for needles, label in family_probes:
        if any(needle in haystack for needle in needles):
            return label
    return issuer_name
+
+

What this block is doing

This function is one of the building blocks inside `ct_monograph_report.py`. It exists so the file can do one narrow job at a time instead of one giant unreadable routine.

+

Flow arrows

Earlier blocks or operator input feed this block. → short_issuer → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## pct + + + + + + +
+
def pct(count: int, total: int) -> str:
    """Format count/total as a one-decimal percentage string.

    A non-positive total yields "0.0%" rather than dividing by zero.
    """
    share = (count / total) * 100 if total > 0 else 0.0
    return f"{share:.1f}%"
+
+

What this block is doing

This is a small helper that keeps the larger analytical code cleaner and easier to reuse.

+

Flow arrows

Earlier blocks or operator input feed this block. → pct → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## purpose_label + + + + + + +
+
def purpose_label(category: str) -> str:
    """Human-readable label for an EKU purpose category key.

    Unknown keys are returned unchanged.
    """
    for key, label in (
        ("tls_server_only", "TLS server only"),
        ("tls_server_and_client", "TLS server and client auth"),
        ("client_auth_only", "Client auth only"),
        ("smime_only", "S/MIME only"),
        ("code_signing_only", "Code signing only"),
        ("mixed_or_other", "Mixed or other"),
        ("no_eku", "No EKU"),
    ):
        if key == category:
            return label
    return category
+
+

What this block is doing

This function is one of the building blocks inside `ct_monograph_report.py`. It exists so the file can do one narrow job at a time instead of one giant unreadable routine.

+

Flow arrows

Earlier blocks or operator input feed this block. → purpose_label → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## purpose_meaning + + + + + + +
+
def purpose_meaning(category: str) -> str:
    """One-sentence explanation of an EKU purpose category key.

    Unknown keys fall back to a generic description.
    """
    explanations = {
        "tls_server_only": "Standard public website or API endpoint certificate.",
        "tls_server_and_client": "Server certificate whose EKU also permits client-certificate use.",
        "client_auth_only": "Identity-style certificate for a person, robot, or agent in mTLS.",
        "smime_only": "Email-signing or email-encryption certificate.",
        "code_signing_only": "Software-signing certificate rather than a web-endpoint certificate.",
        "mixed_or_other": "Unusual or mixed EKU combination requiring case-by-case review.",
        "no_eku": "Certificate without an Extended Key Usage extension.",
    }
    if category in explanations:
        return explanations[category]
    return "Certificate purpose category."
+
+

What this block is doing

This function is one of the building blocks inside `ct_monograph_report.py`. It exists so the file can do one narrow job at a time instead of one giant unreadable routine.

+

Flow arrows

Earlier blocks or operator input feed this block. → purpose_meaning → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## collapse_issuer_counts_by_family + + + + + + +
+
def collapse_issuer_counts_by_family(issuer_counts: dict[str, int]) -> Counter[str]:
    """Aggregate per-issuer certificate counts into CA-family buckets.

    Each issuer name is mapped through short_issuer() and its count added
    to that family's running total.
    """
    grouped: Counter[str] = Counter()
    for name, total in issuer_counts.items():
        grouped[short_issuer(name)] += total
    return grouped
+
+

What this block is doing

This function is one of the building blocks inside `ct_monograph_report.py`. It exists so the file can do one narrow job at a time instead of one giant unreadable routine.

+

Flow arrows

Earlier blocks or operator input feed this block. → collapse_issuer_counts_by_family → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## build_issuer_family_rows + + + + + + +
+
def build_issuer_family_rows(report: dict[str, object]) -> list[dict[str, str]]:
    """Summarise issuer usage grouped by CA family for the monograph tables.

    Returns one row per family, ordered by descending certificate count and
    then case-insensitive family name. A family is flagged as major WebPKI
    only if every one of its issuer variants is.
    """
    issuer_trust = report["issuer_trust"]
    buckets: dict[str, dict[str, object]] = {}
    for issuer_name, count in report["issuer_counts"].most_common():
        key = short_issuer(issuer_name)
        if key not in buckets:
            buckets[key] = {
                "family": key,
                "certificates": 0,
                "variants": [],
                "major_webpki": True,
            }
        bucket = buckets[key]
        bucket["certificates"] += count
        bucket["variants"].append(issuer_name)
        # One non-WebPKI variant disqualifies the whole family.
        bucket["major_webpki"] = bool(bucket["major_webpki"] and issuer_trust[issuer_name].major_webpki)

    def sort_key(bucket: dict[str, object]) -> tuple[int, str]:
        return (-int(bucket["certificates"]), str(bucket["family"]).casefold())

    rows: list[dict[str, str]] = []
    for bucket in sorted(buckets.values(), key=sort_key):
        # Keep only the CN portion of each variant's distinguished name.
        labels = [
            str(variant).split("CN=")[-1]
            for variant in sorted(bucket["variants"], key=str.casefold)
        ]
        rows.append(
            {
                "family": str(bucket["family"]),
                "certificates": str(bucket["certificates"]),
                "variant_count": str(len(labels)),
                "major_webpki": "yes" if bucket["major_webpki"] else "no",
                "variants": ", ".join(labels),
            }
        )
    return rows
+
+

What this block is doing

This block constructs a richer higher-level result from simpler inputs.

+

Flow arrows

Earlier blocks or operator input feed this block. → build_issuer_family_rows → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## build_history_args + + + + + + +
+
def build_history_args(args: argparse.Namespace) -> argparse.Namespace:
    """Derive the argument namespace for the historical lineage stage.

    Reuses the operator's CT settings but points the cache at the history
    cache directory, redirects report outputs to throwaway scratch paths,
    and disables PDF generation, because the monograph embeds the history
    data directly rather than using the standalone history report files.
    """
    scratch = Path(".cache/monograph-temp")
    return argparse.Namespace(
        domains_file=args.domains_file,
        cache_dir=args.history_cache_dir,
        cache_ttl_seconds=args.cache_ttl_seconds,
        max_candidates_per_domain=args.max_candidates_per_domain,
        retries=args.retries,
        quiet=args.quiet,
        markdown_output=scratch / "unused-history.md",
        latex_output=scratch / "unused-history.tex",
        pdf_output=scratch / "unused-history.pdf",
        skip_pdf=True,
        pdf_engine=args.pdf_engine,
    )
+
+

What this block is doing

This block constructs a richer higher-level result from simpler inputs.

+

Flow arrows

Earlier blocks or operator input feed this block. → build_history_args → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## historical_repeated_cn_count + + + + + + +
+
def historical_repeated_cn_count(assessment: ct_lineage_report.HistoricalAssessment) -> int:
    """Count Subject-CN groups that occur more than once in the history data."""
    repeated = 0
    for members in assessment.cn_groups.values():
        if len(members) > 1:
            repeated += 1
    return repeated
+
+

What this block is doing

This function is one of the building blocks inside `ct_monograph_report.py`. It exists so the file can do one narrow job at a time instead of one giant unreadable routine.

+

Flow arrows

Earlier blocks or operator input feed this block. → historical_repeated_cn_count → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## truncate_text + + + + + + +
+
def truncate_text(value: str, limit: int = 88) -> str:
    """Clip *value* to at most *limit* characters, ending with "..." when clipped.

    Trailing whitespace before the ellipsis is stripped so clipped text
    never reads as "word   ...".
    """
    if len(value) > limit:
        return value[: limit - 3].rstrip() + "..."
    return value
+
+

What this block is doing

This is a small helper that keeps the larger analytical code cleaner and easier to reuse.

+

Flow arrows

Earlier blocks or operator input feed this block. → truncate_text → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## first_list_item + + + + + + +
+
def first_list_item(value: str) -> str:
    """Return the first item of a ", "-separated list, or "-" for empty input."""
    if not value:
        return "-"
    return value.partition(", ")[0]
+
+

What this block is doing

This is a small helper that keeps the larger analytical code cleaner and easier to reuse.

+

Flow arrows

Earlier blocks or operator input feed this block. → first_list_item → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## compact_list_items + + + + + + +
+
def compact_list_items(value: str, keep: int = 2, limit: int = 96) -> str:
    """Shorten a ", "-separated list to the first *keep* items plus a tail count.

    Empty input becomes "-"; the final string is additionally clipped to
    *limit* characters via truncate_text().
    """
    if not value:
        return "-"
    items = value.split(", ")
    if len(items) <= keep:
        return truncate_text(value, limit)
    summary = ", ".join(items[:keep]) + f", ... (+{len(items) - keep} more)"
    return truncate_text(summary, limit)
+
+

What this block is doing

This function is one of the building blocks inside `ct_monograph_report.py`. It exists so the file can do one narrow job at a time instead of one giant unreadable routine.

+

Flow arrows

Earlier blocks or operator input feed this block. → compact_list_items → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## compact_family_basis + + + + + + +
+
def compact_family_basis(value: str) -> str:
    """Rewrite known verbose family-basis prefixes to short labels.

    Values without a recognized prefix are width-limited to 92 characters.
    """
    rewrites = (
        ("CN pattern with running-number slot: ", "Numbered family: "),
        (
            "Same endpoint CN family (exact CN; www. grouped with base name): ",
            "Exact endpoint family: ",
        ),
    )
    for old, new in rewrites:
        if value.startswith(old):
            return new + value[len(old):]
    return truncate_text(value, 92)
+
+

What this block is doing

This function is one of the building blocks inside `ct_monograph_report.py`. It exists so the file can do one narrow job at a time instead of one giant unreadable routine.

+

Flow arrows

Earlier blocks or operator input feed this block. → compact_family_basis → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## latex_table_cell
+
def latex_table_cell(value: str) -> str:
    """Escape *value* for LaTeX and allow line breaks after common separators.

    Inserting \allowbreak{} after punctuation lets long identifiers wrap
    inside narrow table columns instead of overflowing.
    """
    breakable = (".", "/", "-", ":", ";", ",", "=")
    result = latex_escape(value)
    for separator in breakable:
        result = result.replace(separator, separator + r"\allowbreak{}")
    return result
+
+

What this block is doing

This function is one of the building blocks inside `ct_monograph_report.py`. It exists so the file can do one narrow job at a time instead of one giant unreadable routine.

+

Flow arrows

Earlier blocks or operator input feed this block. → latex_table_cell → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## append_longtable
+
def append_longtable(
    lines: list[str],
    spec: str,
    headers: list[str],
    rows: list[list[str]],
    *,
    font: str = "small",
    tabcolsep: str | None = "3.8pt",
) -> None:
    """Append a LaTeX longtable rendering of *headers*/*rows* to *lines* in place.

    Args:
        lines: Output buffer of LaTeX source lines; mutated, nothing is returned.
        spec: longtable column specification, e.g. "llr".
        headers: Column header cells, escaped here with latex_escape.
        rows: Body cells, escaped with latex_table_cell (adds break points).
        font: LaTeX font-size command name emitted before the table; a falsy
            value skips the size switch.
        tabcolsep: Column padding length, or None to keep the document default.
    """
    # \begingroup/\endgroup keep the font and \tabcolsep changes local to this table.
    lines.append(r"\begingroup")
    if font:
        lines.append(rf"\{font}")
    if tabcolsep:
        lines.append(rf"\setlength{{\tabcolsep}}{{{tabcolsep}}}")
    lines.append(rf"\begin{{longtable}}{{{spec}}}")
    header_line = " & ".join(latex_escape(header) for header in headers) + r" \\"
    # longtable preamble, in the order longtable requires: first-page head,
    # continuation head, continuation foot ("Continued on next page"),
    # and the final foot carrying the bottom rule.
    lines.extend(
        [
            r"\toprule",
            header_line,
            r"\midrule",
            r"\endfirsthead",
            r"\toprule",
            header_line,
            r"\midrule",
            r"\endhead",
            r"\midrule",
            rf"\multicolumn{{{len(headers)}}}{{r}}{{\footnotesize\itshape Continued on next page}} \\",
            r"\midrule",
            r"\endfoot",
            r"\bottomrule",
            r"\endlastfoot",
        ]
    )
    for row in rows:
        lines.append(" & ".join(latex_table_cell(cell) for cell in row) + r" \\")
    lines.extend([r"\end{longtable}", r"\endgroup"])
+
+

What this block is doing

Shared LaTeX helper for readable multi-page tables.

+

Flow arrows

Earlier blocks or operator input feed this block. → append_longtable → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## nonzero_purpose_rows
+
def nonzero_purpose_rows(purpose_rows: list[list[str]]) -> list[list[str]]:
    """Drop rows whose count column (index 1) is the string "0"."""
    kept: list[list[str]] = []
    for row in purpose_rows:
        if row[1] != "0":
            kept.append(row)
    return kept
+
+

What this block is doing

This function is one of the building blocks inside `ct_monograph_report.py`. It exists so the file can do one narrow job at a time instead of one giant unreadable routine.

+

Flow arrows

Earlier blocks or operator input feed this block. → nonzero_purpose_rows → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## driver_summary + + + + + + +
+
def driver_summary(subjects: str, issuers: str) -> str:
    """Compact one-line driver summary: leading subject plus leading issuer."""
    subject_part = truncate_text(first_list_item(subjects), 48)
    issuer_part = truncate_text(first_list_item(issuers), 28)
    return f"{subject_part}; {issuer_part}"
+
+

What this block is doing

This function is one of the building blocks inside `ct_monograph_report.py`. It exists so the file can do one narrow job at a time instead of one giant unreadable routine.

+

Flow arrows

Earlier blocks or operator input feed this block. → driver_summary → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## counter_text + + + + + + +
+
def counter_text(counter: Counter[str], limit: int = 4) -> str:
    """Render the top *limit* counter entries as "name (count)", noting any remainder."""
    if not counter:
        return "-"
    rendered = [f"{name} ({count})" for name, count in counter.most_common(limit)]
    remainder = len(counter) - limit
    if remainder > 0:
        rendered.append(f"... (+{remainder} more)")
    return ", ".join(rendered)
+
+

What this block is doing

This function is one of the building blocks inside `ct_monograph_report.py`. It exists so the file can do one narrow job at a time instead of one giant unreadable routine.

+

Flow arrows

Earlier blocks or operator input feed this block. → counter_text → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## overlap_signal + + + + + + +
+
def overlap_signal(details: str) -> str:
    """Extract the DN=/SANs= fragments from *details*; fall back to the whole string.

    The result is width-limited to 108 characters for table layout.
    """
    keep = [
        piece
        for piece in details.split("; ")
        if piece.startswith(("DN=", "SANs="))
    ]
    summary = "; ".join(keep) if keep else details
    return truncate_text(summary, 108)
+
+

What this block is doing

This function is one of the building blocks inside `ct_monograph_report.py`. It exists so the file can do one narrow job at a time instead of one giant unreadable routine.

+

Flow arrows

Earlier blocks or operator input feed this block. → overlap_signal → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## caa_source_label + + + + + + +
+
def caa_source_label(source_kind: str) -> str:
    """Map a CAA source-kind token to its human-readable label.

    Unknown tokens are passed through unchanged so new kinds stay visible.
    """
    labels = {
        "exact": "Exact-name CAA",
        "alias_target": "Alias-target CAA",
        "parent": "Inherited parent CAA",
        "parent_alias_target": "Inherited parent CAA reached through alias following",
        "none": "No CAA found",
    }
    return labels.get(source_kind, source_kind)
+
+

What this block is doing

This function is one of the building blocks inside `ct_monograph_report.py`. It exists so the file can do one narrow job at a time instead of one giant unreadable routine.

+

Flow arrows

Earlier blocks or operator input feed this block. → caa_source_label → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## caa_policy_label + + + + + + +
+
def caa_policy_label(families: tuple[str, ...]) -> str:
    """Classify a tuple of allowed CA families into a short policy label.

    Exact known tuples are matched first; then a membership check for
    vendor-delegated families; everything else is a mixed named policy.
    """
    exact_labels = {
        ("UNRESTRICTED",): "No published CAA restriction",
        ("Amazon",): "Amazon-only issuance policy",
        ("DigiCert/QuoVadis", "Sectigo/COMODO"): "Corporate broad policy",
        ("Amazon", "DigiCert/QuoVadis", "Sectigo/COMODO"): "Mixed corporate-plus-Amazon policy",
        ("Google Trust Services", "Sectigo/COMODO"): "Google plus Sectigo policy",
    }
    if families in exact_labels:
        return exact_labels[families]
    if "Let's Encrypt" in families or "Telia" in families:
        return "Vendor-delegated broad policy"
    return "Mixed named policy"
+
+

What this block is doing

This function is one of the building blocks inside `ct_monograph_report.py`. It exists so the file can do one narrow job at a time instead of one giant unreadable routine.

+

Flow arrows

Earlier blocks or operator input feed this block. → caa_policy_label → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## caa_policy_explanation + + + + + + +
+
def caa_policy_explanation(families: tuple[str, ...]) -> str:
    """Explain in one sentence what a CAA family tuple permits.

    Mirrors caa_policy_label: exact known tuples first, then the
    vendor-delegated membership check, then a generic mixed-policy sentence.
    """
    exact_explanations = {
        ("UNRESTRICTED",): "No CAA restriction is published, so WebPKI issuance is not limited by DNS policy.",
        ("Amazon",): "Only Amazon Trust Services identifiers are authorized by DNS policy.",
        ("DigiCert/QuoVadis", "Sectigo/COMODO"): "The name inherits the broad corporate policy that permits the main non-Amazon public CA families seen in this estate.",
        ("Amazon", "DigiCert/QuoVadis", "Sectigo/COMODO"): "The name permits both the broad corporate CA set and Amazon Trust Services.",
        ("Google Trust Services", "Sectigo/COMODO"): "This is a narrow exception that permits Google Trust Services alongside the Sectigo lineage.",
    }
    if families in exact_explanations:
        return exact_explanations[families]
    if "Let's Encrypt" in families or "Telia" in families:
        return "The allowed CA set is wider and looks delegated to a specialist external platform or vendor."
    return "The DNS policy allows a mixed set of public CA families."
+
+

What this block is doing

This function is one of the building blocks inside `ct_monograph_report.py`. It exists so the file can do one narrow job at a time instead of one giant unreadable routine.

+

Flow arrows

Earlier blocks or operator input feed this block. → caa_policy_explanation → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## service_anchor_label + + + + + + +
+
def service_anchor_label(name: str, zone: str) -> str:
    """Return the service-anchor label for *name* within *zone*.

    The anchor is the relative DNS label closest to the zone apex; names in
    the "other" bucket and the zone apex itself are returned unchanged.
    Assumes *name* ends with ".zone" when it is neither — TODO confirm callers.
    """
    if zone == "other":
        return name
    if name == zone:
        return zone
    # Strip ".{zone}" from the end, then take the label nearest the apex.
    relative = name[: -(len(zone) + 1)]
    labels = relative.split(".")
    return labels[-1] if labels else zone
+
+

What this block is doing

This function is one of the building blocks inside `ct_monograph_report.py`. It exists so the file can do one narrow job at a time instead of one giant unreadable routine.

+

Flow arrows

Earlier blocks or operator input feed this block. → service_anchor_label → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## caa_zone_policy_rows + + + + + + +
+
def caa_zone_policy_rows(
    analysis: ct_caa_analysis.CaaAnalysis,
    zone: str,
) -> list[list[str]]:
    """Tabulate the CAA policy mix for one zone: label, count, explanation."""
    zone_rows = ct_caa_analysis.rows_for_zone(analysis, zone)
    policy_counts = ct_caa_analysis.policy_counter(zone_rows)
    table: list[list[str]] = []
    for policy, count in policy_counts.most_common():
        table.append(
            [
                caa_policy_label(policy),
                str(count),
                caa_policy_explanation(policy),
            ]
        )
    return table
+
+

What this block is doing

This function is one of the building blocks inside `ct_monograph_report.py`. It exists so the file can do one narrow job at a time instead of one giant unreadable routine.

+

Flow arrows

Earlier blocks or operator input feed this block. → caa_zone_policy_rows → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## caa_source_rows + + + + + + +
+
def caa_source_rows(analysis: ct_caa_analysis.CaaAnalysis) -> list[list[str]]:
    """Tabulate how each effective CAA policy was discovered.

    Returns one row per source kind seen in the analysis, most common first:
    [human label, count, plain-language meaning of that discovery path].
    """
    # Hoisted out of the comprehension: the original rebuilt this literal
    # dict on every iteration.
    explanations = {
        "exact": "The queried DNS name itself published the effective CAA.",
        "alias_target": "The queried DNS name resolved through an alias and the effective CAA came from what that alias chain exposed.",
        "parent": "The leaf name had no CAA, so issuance policy was inherited from a parent DNS node.",
        "parent_alias_target": "The leaf name inherited from a parent DNS node, and that parent policy was itself exposed through an alias response.",
        "none": "No effective CAA was found at the name or its parents.",
    }
    return [
        [
            caa_source_label(source_kind),
            str(count),
            explanations.get(source_kind, "CAA discovery result."),
        ]
        for source_kind, count in analysis.source_kind_counts.most_common()
    ]
+
+

What this block is doing

This function is one of the building blocks inside `ct_monograph_report.py`. It exists so the file can do one narrow job at a time instead of one giant unreadable routine.

+

Flow arrows

Earlier blocks or operator input feed this block. → caa_source_rows → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## top_caa_overlap_rows + + + + + + +
+
def top_caa_overlap_rows(analysis: ct_caa_analysis.CaaAnalysis, limit: int = 15) -> list[list[str]]:
    """List up to *limit* names whose current certificates span multiple CA families.

    Rows are ordered by zone, then service anchor, then full name, and carry:
    name, zone, covering families, and a compacted list of covering Subject CNs.
    """
    def sort_key(row):
        return (row.zone, service_anchor_label(row.name, row.zone), row.name)

    overlapping = sorted(
        (row for row in analysis.rows if row.current_multi_family_overlap),
        key=sort_key,
    )
    table: list[list[str]] = []
    for row in overlapping[:limit]:
        table.append(
            [
                row.name,
                row.zone,
                ", ".join(row.current_covering_families),
                compact_list_items(", ".join(row.current_covering_subject_cns), keep=2, limit=64),
            ]
        )
    return table
+
+

What this block is doing

This function is one of the building blocks inside `ct_monograph_report.py`. It exists so the file can do one narrow job at a time instead of one giant unreadable routine.

+

Flow arrows

Earlier blocks or operator input feed this block. → top_caa_overlap_rows → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## top_caa_mismatch_rows + + + + + + +
+
def top_caa_mismatch_rows(analysis: ct_caa_analysis.CaaAnalysis, limit: int = 15) -> list[list[str]]:
    """List up to *limit* names whose current issuers fall outside the CAA policy.

    Rows are ordered by zone, then service anchor, then full name, and carry:
    name, zone, covering families, allowed families (or UNRESTRICTED), and
    the human-readable CAA source label.
    """
    def sort_key(row):
        return (row.zone, service_anchor_label(row.name, row.zone), row.name)

    mismatched = sorted(
        (row for row in analysis.rows if row.current_policy_mismatch),
        key=sort_key,
    )
    table: list[list[str]] = []
    for row in mismatched[:limit]:
        table.append(
            [
                row.name,
                row.zone,
                ", ".join(row.current_covering_families),
                ", ".join(row.allowed_ca_families) or "UNRESTRICTED",
                caa_source_label(row.source_kind),
            ]
        )
    return table
+
+

What this block is doing

This function is one of the building blocks inside `ct_monograph_report.py`. It exists so the file can do one narrow job at a time instead of one giant unreadable routine.

+

Flow arrows

Earlier blocks or operator input feed this block. → top_caa_mismatch_rows → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## caa_concentration_text + + + + + + +
+
def caa_concentration_text(analysis: ct_caa_analysis.CaaAnalysis, zone: str) -> str:
    """Summarize which service anchors in *zone* concentrate CAA findings.

    Counts rows flagged with a policy mismatch or multi-family overlap and
    renders the six busiest anchors as "anchor (count)"; "none" when clean.
    """
    flagged = [
        row
        for row in ct_caa_analysis.rows_for_zone(analysis, zone)
        if row.current_policy_mismatch or row.current_multi_family_overlap
    ]
    if not flagged:
        return "none"
    anchor_counts = Counter(service_anchor_label(row.name, zone) for row in flagged)
    parts = [f"{label} ({count})" for label, count in anchor_counts.most_common(6)]
    return ", ".join(parts)
+
+

What this block is doing

This function is one of the building blocks inside `ct_monograph_report.py`. It exists so the file can do one narrow job at a time instead of one giant unreadable routine.

+

Flow arrows

Earlier blocks or operator input feed this block. → caa_concentration_text → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## focus_comparison_rows + + + + + + +
+
def focus_comparison_rows(focus_analysis: ct_focus_subjects.FocusCohortAnalysis) -> list[list[str]]:
    """Build the cohort-vs-rest comparison table.

    Each row: [metric name, focus-cohort value, rest-of-estate value,
    one-sentence reading aid for the operator].
    """
    # sum() over an empty values() view is already 0, so the original
    # `len(d) and sum(d.values()) or 0` chain was redundant — and the
    # `cond and a or b` idiom silently misfires whenever `a` is falsy.
    rest_subject_dns_total = sum(focus_analysis.rest_current_subject_dns_classes.values())
    return [
        [
            "Current direct Subject CN names",
            str(focus_analysis.current_direct_subjects_count),
            str(rest_subject_dns_total),
            "Shows whether the cohort is mostly made of live direct front-door names or of carried SAN passengers.",
        ],
        [
            "Current certificates",
            str(focus_analysis.current_focus_certificate_count),
            str(focus_analysis.current_rest_certificate_count),
            "Shows the raw current certificate weight of the cohort against the rest of the estate.",
        ],
        [
            "Issuer families in current certificates",
            counter_text(focus_analysis.focus_current_issuer_families, 3),
            counter_text(focus_analysis.rest_current_issuer_families, 3),
            "Separates the older Sectigo/COMODO-style public web cohort from the Amazon-heavy operational rail population.",
        ],
        [
            "Revoked share inside current certificates",
            focus_analysis.focus_revoked_share,
            focus_analysis.rest_revoked_share,
            "A high revoked share points to rapid replacement churn or short-lived issuance iterations.",
        ],
        [
            "Median SAN entries per current certificate",
            str(focus_analysis.focus_median_san_entries),
            str(focus_analysis.rest_median_san_entries),
            "Small SAN sets usually indicate standalone front doors; large SAN sets usually indicate bundled platform coverage.",
        ],
        [
            "Current multi-zone certificates",
            str(focus_analysis.focus_multi_zone_certificate_count),
            str(focus_analysis.rest_multi_zone_certificate_count),
            "Multi-zone certificates are strong evidence of shared-service or bridge certificates rather than one-name service fronts.",
        ],
    ]
+
+

What this block is doing

This function is one of the building blocks inside `ct_monograph_report.py`. It exists so the file can do one narrow job at a time instead of one giant unreadable routine.

+

Flow arrows

Earlier blocks or operator input feed this block. → focus_comparison_rows → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## focus_bucket_details + + + + + + +
+
def focus_bucket_details(
    focus_analysis: ct_focus_subjects.FocusCohortAnalysis,
    bucket: str,
) -> list[ct_focus_subjects.FocusSubjectDetail]:
    """Select the focus-subject details that were assigned to *bucket*."""
    selected: list[ct_focus_subjects.FocusSubjectDetail] = []
    for detail in focus_analysis.details:
        if detail.taxonomy_bucket == bucket:
            selected.append(detail)
    return selected
+
+

What this block is doing

This function is one of the building blocks inside `ct_monograph_report.py`. It exists so the file can do one narrow job at a time instead of one giant unreadable routine.

+

Flow arrows

Earlier blocks or operator input feed this block. → focus_bucket_details → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## focus_bucket_examples + + + + + + +
+
def focus_bucket_examples(
    focus_analysis: ct_focus_subjects.FocusCohortAnalysis,
    bucket: str,
    limit: int = 4,
) -> str:
    """List up to *limit* representative Subject CNs for *bucket*, busiest first.

    Ranking: current direct certificates, then historical direct certificates
    (both descending), with case-insensitive CN as the tie-break. A remainder
    note is appended when the bucket holds more than *limit* subjects.
    """
    details = focus_bucket_details(focus_analysis, bucket)
    if not details:
        return "-"

    def priority(item):
        return (
            -item.current_direct_certificates,
            -item.historical_direct_certificates,
            item.subject_cn.casefold(),
        )

    ranked = sorted(details, key=priority)
    shown = [detail.subject_cn for detail in ranked[:limit]]
    hidden = len(ranked) - limit
    if hidden > 0:
        shown.append(f"... (+{hidden} more)")
    return ", ".join(shown)
+
+

What this block is doing

This function is one of the building blocks inside `ct_monograph_report.py`. It exists so the file can do one narrow job at a time instead of one giant unreadable routine.

+

Flow arrows

Earlier blocks or operator input feed this block. → focus_bucket_examples → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## focus_bucket_summary_rows + + + + + + +
+
def focus_bucket_summary_rows(focus_analysis: ct_focus_subjects.FocusCohortAnalysis) -> list[list[str]]:
    """Build one summary row per taxonomy bucket: label, size, examples, meaning, why."""
    # Insertion order doubles as the fixed presentation order of the buckets.
    meanings = {
        "direct_front_door": (
            "Direct branded, service, identity, or vendor-facing names with small SAN sets and one-zone scope.",
            "These are the names a human operator is most likely to remember as visible service fronts rather than as hidden platform rails.",
        ),
        "platform_matrix_anchor": (
            "Umbrella certificates with large SAN matrices encoding environment, tenant, service-cell, or monitoring axes.",
            "These names anchor a managed platform slice rather than a single public page or API front.",
        ),
        "ambiguous_legacy": (
            "Historical residue, carried SAN passengers, opaque labels, or mixed-shape names that no longer fit a clean live pattern.",
            "This bucket captures the messy edge cases where migration, retirement, or naming opacity matters more than current front-door behavior.",
        ),
    }
    return [
        [
            ct_focus_subjects.taxonomy_bucket_label(bucket),
            str(focus_analysis.bucket_counts.get(bucket, 0)),
            truncate_text(focus_bucket_examples(focus_analysis, bucket), 72),
            meaning,
            why,
        ]
        for bucket, (meaning, why) in meanings.items()
    ]
+
+

What this block is doing

This function is one of the building blocks inside `ct_monograph_report.py`. It exists so the file can do one narrow job at a time instead of one giant unreadable routine.

+

Flow arrows

Earlier blocks or operator input feed this block. → focus_bucket_summary_rows → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## focus_representative_rows + + + + + + +
+
def focus_representative_rows(focus_analysis: ct_focus_subjects.FocusCohortAnalysis) -> list[list[str]]:
    """Pick up to four representative subjects per taxonomy bucket.

    Buckets are emitted in fixed presentation order. Within a bucket,
    subjects are ranked by current direct certificates, then historical
    direct certificates, then current non-focus SAN carriers (all
    descending), with case-insensitive CN as the tie-break. Each row:
    bucket label, CN, role, "current/historical" counts, compact detail text.
    """
    rows: list[list[str]] = []
    for bucket in ["direct_front_door", "platform_matrix_anchor", "ambiguous_legacy"]:
        details = focus_bucket_details(focus_analysis, bucket)
        if not details:
            # Empty buckets are simply skipped rather than rendered as "-" rows.
            continue
        ordered = sorted(
            details,
            key=lambda item: (
                -item.current_direct_certificates,
                -item.historical_direct_certificates,
                -item.current_non_focus_san_carriers,
                item.subject_cn.casefold(),
            ),
        )
        for detail in ordered[:4]:
            rows.append(
                [
                    ct_focus_subjects.taxonomy_bucket_label(bucket),
                    detail.subject_cn,
                    truncate_text(detail.observed_role, 30),
                    f"{detail.current_direct_certificates}/{detail.historical_direct_certificates}",
                    truncate_text(
                        f"current SANs={detail.current_san_size_span}, historical SANs={detail.historical_san_size_span}, DNS={detail.current_dns_outcome}",
                        78,
                    ),
                ]
            )
    return rows
+
+

What this block is doing

This function is one of the building blocks inside `ct_monograph_report.py`. It exists so the file can do one narrow job at a time instead of one giant unreadable routine.

+

Flow arrows

Earlier blocks or operator input feed this block. → focus_representative_rows → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## focus_appendix_rows + + + + + + +
+
def focus_appendix_rows(
    focus_analysis: ct_focus_subjects.FocusCohortAnalysis,
    bucket: str,
) -> list[list[str]]:
    """Build one appendix table row per focus subject assigned to *bucket*.

    Columns: CN, taxonomy reason, analyst note, observed role,
    current/historical direct-certificate counts, current/historical
    carrier counts, SAN-size spans, DNS outcome, revocation split, and
    current/past red flags. Free-text fields are truncated so the LaTeX
    table stays within its column widths.
    """
    # Comprehension instead of the original append loop (same rows, same order).
    return [
        [
            detail.subject_cn,
            truncate_text(detail.taxonomy_reason, 40),
            truncate_text(detail.analyst_note, 28),
            truncate_text(detail.observed_role, 28),
            f"{detail.current_direct_certificates}/{detail.historical_direct_certificates}",
            f"{detail.current_non_focus_san_carriers}/{detail.historical_non_focus_san_carriers}",
            truncate_text(f"{detail.current_san_size_span}/{detail.historical_san_size_span}", 16),
            truncate_text(detail.current_dns_outcome, 24),
            f"revoked={detail.current_revoked_certificates}, live={detail.current_not_revoked_certificates}",
            truncate_text(detail.current_red_flags, 24),
            truncate_text(detail.past_red_flags, 24),
        ]
        for detail in focus_bucket_details(focus_analysis, bucket)
    ]
+
+

What this block is doing

This function is one of the building blocks inside `ct_monograph_report.py`. It exists so the file can do one narrow job at a time instead of one giant unreadable routine.

+

Flow arrows

Earlier blocks or operator input feed this block. → focus_appendix_rows → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## example_pattern_label + + + + + + +
+
def example_pattern_label(title: str) -> str:
    """Map a worked-example title to its generic naming-pattern label.

    Unknown titles fall back to the generic "Naming pattern" label.
    """
    pattern_labels = {
        "Shared operational rail": "Numbered fleet or operational-rail naming",
        "Environment matrix certificate": "Environment-matrix and lifecycle naming",
        "Brand-platform splice": "Cross-brand namespace and migration-residue naming",
        "Cross-zone bridge": "Cross-zone bridge or shared-service naming",
    }
    return pattern_labels.get(title, "Naming pattern")
+
+

What this block is doing

This function is one of the building blocks inside `ct_monograph_report.py`. It exists so the file can do one narrow job at a time instead of one giant unreadable routine.

+

Flow arrows

Earlier blocks or operator input feed this block. → example_pattern_label → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## delivery_pattern_meaning + + + + + + +
+
def delivery_pattern_meaning(label: str) -> str:
    """Return the plain-language meaning of a delivery-pattern *label*.

    Keys are the exact pattern labels assigned during DNS-chain
    classification; unknown labels get a generic fallback sentence. The
    sentences are report copy, so they are kept as a single literal lookup.
    """
    return {
        "Adobe Campaign -> AWS ALB": "The public name first aliases into Adobe Campaign naming and then lands on Amazon load-balancing infrastructure. In plain terms, a messaging or campaign front appears to sit in front of AWS-hosted delivery.",
        "Adobe Campaign -> AWS CloudFront": "The public name first aliases into Adobe Campaign naming and then into Amazon CloudFront. That usually means campaign or messaging traffic delivered through a CDN edge.",
        "Adobe Campaign direct IP": "Adobe Campaign naming is visible in the DNS trail, but the public name lands straight on an address rather than on an obvious CDN or load balancer hostname.",
        "AWS CloudFront": "The public name lands on Amazon's CDN edge without an Adobe layer. This usually means edge delivery for web or API traffic.",
        "Google Apigee": "The public name lands on a managed API front door. That normally means the endpoint is being exposed as a governed API rather than directly from an application host.",
        "Pega Cloud -> AWS ALB": "The public name points to Pega-managed application hosting that ultimately lands on AWS load-balancing infrastructure.",
        "Direct AWS": "The public name lands directly on AWS-hosted infrastructure without a visible intermediary platform in public DNS.",
        "Direct Microsoft edge": "The public name lands on Microsoft's front-door edge addresses rather than directly on a private application host.",
        "CNAME to address (provider unclear)": "The public name aliases to another hostname and then to an address, but the public clues are too weak to assign a platform vendor confidently.",
        "Direct address (provider unclear)": "The public name resolves straight to an address, with no strong provider clue visible in public DNS.",
        "No public DNS (NXDOMAIN)": "The name contained in certificates does not currently exist in public DNS.",
        "No public address data": "The name exists in DNS, but no public A or AAAA address was returned during the scan.",
        "Dangling agency alias": "The name aliases to a third-party intermediary hostname that no longer resolves cleanly. That usually indicates stale or partially removed DNS.",
    }.get(label, "Recurring public DNS outcome derived from the observed answer chain.")
+
+

What this block is doing

This function is one of the building blocks inside `ct_monograph_report.py`. It exists so the file can do one narrow job at a time instead of one giant unreadable routine.

+

Flow arrows

Earlier blocks or operator input feed this block. → delivery_pattern_meaning → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## delivery_pattern_rule + + + + + + +
+
def delivery_pattern_rule(label: str) -> str:
    """Explain which public DNS evidence triggers a given delivery-pattern label.

    Returns the classification rule text for *label*; unknown labels fall back
    to a generic description of how the heuristic reads the DNS answer shape.
    """
    rules: dict[str, str] = {
        "Adobe Campaign -> AWS ALB": "Used when the alias chain contains Adobe Campaign naming and the terminal DNS clues point to AWS load-balancer or AWS-hosted infrastructure.",
        "Adobe Campaign -> AWS CloudFront": "Used when the alias chain contains Adobe Campaign naming and the terminal target contains CloudFront clues.",
        "Adobe Campaign direct IP": "Used when Adobe Campaign naming is visible but the name lands directly on an IP address.",
        "AWS CloudFront": "Used when the terminal DNS target contains CloudFront clues without an Adobe Campaign layer in front of it.",
        "Google Apigee": "Used when the alias chain or terminal target contains Apigee or Google API gateway clues such as apigee.net.",
        "Pega Cloud -> AWS ALB": "Used when the DNS trail contains Pega-hosting clues and then AWS load-balancer clues.",
        "Direct AWS": "Used when the name lands directly on AWS clues without an intermediate branded platform layer.",
        "Direct Microsoft edge": "Used when the address falls in the public Microsoft front-door ranges used in this heuristic.",
        "CNAME to address (provider unclear)": "Used when a CNAME chain exists, but no recognized provider clue appears in the public DNS trail.",
        "Direct address (provider unclear)": "Used when the name resolves directly to an address and no recognized provider clue appears.",
        "No public DNS (NXDOMAIN)": "Used when the DNS lookup returns NXDOMAIN.",
        "No public address data": "Used when DNS exists but returns no public address data.",
        "Dangling agency alias": "Used when the alias chain points to the agency-style intermediary namespace but does not resolve to a live endpoint.",
    }
    # Fallback keeps the report readable even for labels added later upstream.
    fallback = "Derived from the public DNS answer shape and the provider clues seen in names, targets, and PTRs."
    return rules.get(label, fallback)
+
+

What this block is doing

This function is one of the building blocks inside `ct_monograph_report.py`. It exists so the file can do one narrow job at a time instead of one giant unreadable routine.

+

Flow arrows

Earlier blocks or operator input feed this block. → delivery_pattern_rule → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+
+## render_markdown
+
+
def render_markdown(
+    args: argparse.Namespace,
+    report: dict[str, object],
+    assessment: ct_lineage_report.HistoricalAssessment,
+    caa_analysis: ct_caa_analysis.CaaAnalysis,
+    focus_analysis: ct_focus_subjects.FocusCohortAnalysis | None,
+) -> None:
+    args.markdown_output.parent.mkdir(parents=True, exist_ok=True)
+    appendix_markdown = args.appendix_markdown_output.read_text(encoding="utf-8")
+    hits = report["hits"]
+    groups = report["groups"]
+    purpose_summary = report["purpose_summary"]
+    total_certificates = len(report["classifications"])
+    dual_items = [item for item in report["classifications"] if item.category == "tls_server_and_client"]
+    dual_issuer_counts = Counter(short_issuer(item.issuer_name) for item in dual_items)
+    server_only_count = purpose_summary.category_counts.get("tls_server_only", 0)
+    dual_count = purpose_summary.category_counts.get("tls_server_and_client", 0)
+    server_only_issuer_families = collapse_issuer_counts_by_family(
+        purpose_summary.issuer_breakdown.get("tls_server_only", {})
+    )
+    historical_count = len(assessment.certificates)
+    historical_current_count = sum(1 for item in assessment.certificates if item.current)
+    repeated_cn_count = historical_repeated_cn_count(assessment)
+    purpose_rows = [
+        [
+            purpose_label(category),
+            str(count),
+            pct(count, total_certificates),
+            purpose_meaning(category),
+        ]
+        for category, count in [
+            ("tls_server_only", purpose_summary.category_counts.get("tls_server_only", 0)),
+            ("tls_server_and_client", purpose_summary.category_counts.get("tls_server_and_client", 0)),
+            ("client_auth_only", purpose_summary.category_counts.get("client_auth_only", 0)),
+            ("smime_only", purpose_summary.category_counts.get("smime_only", 0)),
+            ("code_signing_only", purpose_summary.category_counts.get("code_signing_only", 0)),
+            ("mixed_or_other", purpose_summary.category_counts.get("mixed_or_other", 0)),
+            ("no_eku", purpose_summary.category_counts.get("no_eku", 0)),
+        ]
+    ]
+    visible_purpose_rows = nonzero_purpose_rows(purpose_rows)
+    eku_template_rows = [
+        [template, str(count), pct(count, total_certificates)]
+        for template, count in purpose_summary.eku_templates.items()
+    ]
+    key_usage_rows = [
+        [template, str(count), pct(count, total_certificates)]
+        for template, count in purpose_summary.key_usage_templates.items()
+    ]
+    issuer_rows = [
+        [
+            row["family"],
+            row["certificates"],
+            row["variant_count"],
+            row["major_webpki"],
+            row["variants"],
+        ]
+        for row in build_issuer_family_rows(report)
+    ]
+    family_rows = [
+        [
+            compact_family_basis(row["basis"]),
+            str(row["certificates"]),
+            str(row["subjects"]),
+            first_list_item(row["top_stacks"]),
+        ]
+        for row in report["group_digest"]
+    ]
+    dual_rows = [
+        [
+            item.subject_cn,
+            item.valid_from_utc[:10],
+            item.valid_to_utc[:10],
+            short_issuer(item.issuer_name),
+            str(len(item.san_dns_names)),
+        ]
+        for item in dual_items
+    ]
+    dns_stack_rows = [
+        [label, str(count)]
+        for label, count in report["dns_stack_counts"].most_common(12)
+    ]
+    dns_class_counts = report["dns_class_counts"]
+    alias_to_address_count = dns_class_counts.get("cname_to_address", 0)
+    direct_address_count = dns_class_counts.get("direct_address", 0)
+    nxdomain_count = dns_class_counts.get("nxdomain", 0)
+    dangling_count = dns_class_counts.get("dangling_cname", 0)
+    no_data_count = dns_class_counts.get("no_data", 0)
+    top_dns_patterns = report["dns_stack_counts"].most_common(8)
+    dns_pattern_rows = [
+        [label, str(count), delivery_pattern_meaning(label)]
+        for label, count in top_dns_patterns
+    ]
+    focus_comparison = focus_comparison_rows(focus_analysis) if focus_analysis else []
+    focus_bucket_summary = focus_bucket_summary_rows(focus_analysis) if focus_analysis else []
+    focus_representatives = focus_representative_rows(focus_analysis) if focus_analysis else []
+    has_focus = focus_analysis is not None
+    caa_zone_rows = {
+        zone: caa_zone_policy_rows(caa_analysis, zone)
+        for zone in caa_analysis.configured_domains
+    }
+    primary_zone = report["domains"][0] if report["domains"] else "configured primary zone"
+    secondary_zone = report["domains"][1] if len(report["domains"]) > 1 else None
+    synthesis_chapter = 9 if has_focus else 8
+    limits_chapter = 10 if has_focus else 9
+    caa_appendix = "C"
+    focus_appendix = "D" if has_focus else None
+    detailed_inventory_appendix = "E" if has_focus else "D"
+    lines: list[str] = []
+    lines.append("# CT and DNS Monograph")
+    lines.append("")
+    lines.append(f"Generated: {report['generated_at_utc']}")
+    lines.append(f"Configured search terms file: `{args.domains_file.name}`")
+    lines.append("")
+    lines.append("## Executive Summary")
+    lines.append("")
+    lines.extend(
+        [
+            f"- **{len(hits)}** current leaf certificates are in scope on this run.",
+            f"- **{len(groups)}** CN families reduce the estate into readable naming clusters.",
+            f"- **{purpose_summary.category_counts.get('tls_server_only', 0)}** certificates are ordinary public TLS server certificates, while **{purpose_summary.category_counts.get('tls_server_and_client', 0)}** come from templates that also permit client-certificate use.",
+            f"- **{historical_count}** historical leaf certificates show how these names evolved over time, including expired renewal history.",
+            f"- **{len(report['unique_dns_names'])}** unique DNS SAN names were scanned live.",
+            f"- **{caa_analysis.total_names}** DNS names were also assessed for effective CAA policy, revealing where issuance is centrally governed, delegated, or left unrestricted.",
+            "- The estate is best understood as several layers laid on top of one another: brand naming, service naming, platform naming, delivery-stack naming, issuance-policy control, and migration residue.",
+        ]
+    )
+    lines.append("")
+    lines.append("## Reading Guide")
+    lines.append("")
+    lines.extend(
+        [
+            "- Read Chapter 1 if you want to know whether the corpus is complete and trustworthy.",
+            "- Read Chapters 2 and 3 if you want the current certificate-side story: issuers, trust, and purpose.",
+            "- Read Chapter 4 if you want the historical lifecycle view and the red flags split into current versus fixed-in-the-past.",
+            "- Read Chapters 5 and 6 if you want the naming and DNS story.",
+            "- Read Chapter 7 if you want the issuance-policy view: which public CAs are authorized by DNS and where that control is absent, inherited, or delegated.",
+            *(
+                ["- Read Chapter 8 if you want the focused Subject-CN cohort analysis and why that subset behaves differently from the wider estate."]
+                if has_focus
+                else []
+            ),
+            f"- Read Chapter {synthesis_chapter} if you want the synthesis that ties business naming, service architecture, and hosting patterns together.",
+            "- Use the appendices when you need the fine-grained evidence rather than the argument.",
+        ]
+    )
+    lines.append("")
+    lines.append("## Chapter 1: Scope, Completeness, and Proof")
+    lines.append("")
+    lines.append("**Management Summary**")
+    lines.append("")
+    lines.extend(
+        [
+            f"- The first broad crt.sh search returned {', '.join(f'{domain}={count} matching index rows' for domain, count in report['raw_match_counts'].items())}. Those rows are leads, not final certificate count.",
+            f"- The scanner was allowed to collect up to {report['cap']} candidate rows per search term. Because the live match counts stayed below that limit, nothing was silently cut off.",
+            f"- After downloading and parsing the actual certificate bodies, {report['verification'].unique_leaf_certificates} genuine leaf certificates remained. {report['verification'].non_leaf_filtered} CA-style certificates and {report['verification'].precertificate_poison_filtered} precertificate marker objects were rejected.",
+            f"- Certificates missing the searched-for domains in their DNS SANs after full parsing: {report['missing_matching_san']}.",
+        ]
+    )
+    lines.append("")
+    lines.append("This chapter answers the first and most important question: whether the report is built on a complete and trustworthy corpus. The scanner now checks the live raw match count before issuing the capped query. If the cap is too low, it fails instead of silently undercounting.")
+    lines.append("")
+    lines.append("The first crt.sh row count is intentionally larger than the final certificate count because Certificate Transparency search results are index rows, not de-duplicated certificates. The report therefore reads the binary certificate body itself, removes duplicates, rejects CA certificates and precertificate marker objects, and only then builds the working corpus.")
+    lines.append("")
+    lines.append("In other words: this publication is not based on search-result snippets alone. It is based on the parsed X.509 certificate bodies.")
+    lines.append("")
+    lines.append("## Chapter 2: The Certificate Corpus")
+    lines.append("")
+    lines.append("**Management Summary**")
+    lines.append("")
+    lines.extend(
+        [
+            f"- Issuer families by certificate count: {', '.join(f'{name} ({count})' for name, count in report['issuer_family_counts'].most_common())}.",
+            f"- Revocation state in plain terms: {report['rev_counts'].get('not_revoked', 0)} certificates are not marked revoked, and {report['rev_counts'].get('revoked', 0)} were later marked invalid by their issuing CA before natural expiry.",
+            f"- For every current certificate, the main Subject CN hostname also appears literally in the DNS SAN list. The headline name on the certificate is therefore one of the real covered hostnames, not a decorative label.",
+            f"- All visible issuer families in this corpus are currently trusted by the major public browser and operating-system trust stores for ordinary web server use.",
+        ]
+    )
+    lines.append("")
+    lines.append("A certificate corpus can look random when viewed as a flat list. It becomes intelligible once you group it by issuer family, Subject CN construction, validity history, and SAN design. That is why the appendices are arranged as families rather than raw rows.")
+    lines.append("")
+    lines.append("### Issuer Trust Table")
+    lines.append("")
+    lines.extend(md_table(["Issuer Family", "Certificates", "Variants", "Major WebPKI"], [row[:4] for row in issuer_rows]))
+    lines.append("")
+    lines.append("**What WebPKI trust means**")
+    lines.append("")
+    lines.append("A WebPKI-trusted issuer is a certificate authority trusted by mainstream browser and operating-system trust stores for public TLS. That matters because it tells you these certificates are not part of a private PKI hidden inside one organisation. They are intended to be valid in the public Internet trust model.")
+    lines.append("")
+    lines.append("This view should answer one question only: how many publicly trusted issuer families are present in the estate. The exact subordinate issuer names are supporting evidence, so they stay in the appendix inventory rather than cluttering the main chapter.")
+    lines.append("")
+    lines.append("## Chapter 3: Intended Purpose of the Certificates")
+    lines.append("")
+    lines.append("**Management Summary**")
+    lines.append("")
+    lines.extend(
+        [
+            f"- Certificates whose allowed purpose is ordinary server authentication only: {purpose_summary.category_counts.get('tls_server_only', 0)}.",
+            f"- Certificates whose policy allows both server use and client-certificate use: {purpose_summary.category_counts.get('tls_server_and_client', 0)}.",
+            "- Certificates dedicated only to client identity, email signing, or code signing: 0.",
+        ]
+    )
+    lines.append("")
+    lines.append("This chapter addresses a key ambiguity. A certificate can be technically valid for several uses, and the hostname alone does not settle that question. The corpus was therefore assessed from the X.509 usage fields themselves: EKU and KeyUsage.")
+    lines.append("")
+    lines.append("### Purpose Map")
+    lines.append("")
+    lines.extend(md_table(["Usage Class", "Certificates", "Share", "Meaning"], visible_purpose_rows))
+    lines.append("")
+    lines.append("This view should answer only what kind of certificates these are. Zero-count categories are deliberately removed here because they add noise without changing the conclusion.")
+    lines.append("")
+    lines.append("The basic picture is simple: the corpus is overwhelmingly made of ordinary public TLS server certificates, with a smaller minority whose EKU also permits client-certificate use.")
+    lines.append("")
+    lines.append("**Plain-language explanation of the usage categories**")
+    lines.append("")
+    lines.extend(
+        [
+            "- **TLS server certificate**: the certificate a website or API presents to a browser, app, or machine client.",
+            "- **Server and client auth certificate**: a certificate whose EKU allows both server use and client-certificate use. That does not automatically mean it is actually used as a client certificate, but it leaves that door open.",
+            "- **Client auth only**: the kind of certificate you would expect for a user, robot, or agent identity in mutual TLS.",
+            "- **S/MIME**: email-signing or email-encryption certificates.",
+            "- **Code signing**: certificates used to sign software rather than to secure a web endpoint.",
+        ]
+    )
+    lines.append("")
+    lines.append("The result is clean. This corpus is entirely TLS-capable. There is no evidence of a separate S/MIME or code-signing estate, and there are no client-auth-only certificates.")
+    lines.append("")
+    lines.append("### EKU and KeyUsage Templates")
+    lines.append("")
+    lines.append("At the template level, the corpus is even simpler than the certificate count suggests. Here, a template simply means a repeated combination of usage fields. Only two EKU combinations appear at all, and one KeyUsage pattern dominates almost completely.")
+    lines.append("")
+    lines.extend(md_table(["EKU Template", "Certificates", "Share"], eku_template_rows))
+    lines.append("")
+    lines.extend(md_table(["KeyUsage Template", "Certificates", "Share"], key_usage_rows))
+    lines.append("")
+    lines.append("### The Majority Pattern: Server-Only Public TLS")
+    lines.append("")
+    lines.extend(
+        [
+            f"- Server-only certificates account for {server_only_count} of {total_certificates} certificates, or {pct(server_only_count, total_certificates)} of the corpus.",
+            f"- Server-only validity starts are split between {', '.join(f'{year} ({count})' for year, count in purpose_summary.validity_start_years.get('tls_server_only', {}).items())}.",
+            f"- Server-only issuer-family concentration: {', '.join(f'{name} ({count})' for name, count in server_only_issuer_families.most_common())}.",
+            "- This is the normal public WebPKI server-certificate pattern for websites, APIs, and edge service front doors.",
+        ]
+    )
+    lines.append("")
+    lines.append("This majority group is not background noise. It is the main operational reality visible in the scan: public DNS names covered by publicly trusted endpoint certificates.")
+    lines.append("")
+    if dual_rows:
+        lines.append("### The Minority Pattern: Dual EKU")
+        lines.append("")
+        lines.append("EKU means *allowed purpose*, not *observed real-world use*. A dual-EKU certificate is a certificate whose X.509 policy says it may be used both as a TLS server certificate and as a TLS client certificate.")
+        lines.append("")
+        lines.extend(
+            [
+                f"- Dual-EKU certificates in this corpus: {dual_count}, or {pct(dual_count, total_certificates)} of the corpus.",
+                f"- Issuer-family concentration inside the dual-EKU group: {', '.join(f'{name} ({count})' for name, count in dual_issuer_counts.most_common())}.",
+                f"- Dual-EKU Subject CN families that also have a strict server-only sibling: {len(purpose_summary.dual_eku_subject_cns_with_server_only_sibling)}.",
+                f"- Dual-EKU Subject CN families that appear only in the dual-EKU group: {len(purpose_summary.dual_eku_subject_cns_without_server_only_sibling)}.",
+                f"- Dual-EKU validity starts are split between {', '.join(f'{year} ({count})' for year, count in purpose_summary.validity_start_years.get('tls_server_and_client', {}).items())}.",
+            ]
+        )
+        lines.append("")
+        lines.append("The important interpretation point is this: these still look like public hostname certificates, not person or robot identity certificates. They have DNS-style Subject CN values, DNS SAN lists, and public WebPKI issuers. The best reading is therefore not 'this is a separate client-certificate estate', but rather 'some server certificates were issued from a template that also allowed clientAuth'.")
+        lines.append("")
+    lines.append("### What Is Not Present")
+    lines.append("")
+    lines.extend(
+        [
+            "- There are no client-auth-only certificates in the corpus.",
+            "- There are no S/MIME certificates in the corpus.",
+            "- There are no code-signing certificates in the corpus.",
+            "- There are no mixed-or-other EKU combinations and no certificates missing EKU entirely.",
+        ]
+    )
+    lines.append("")
+    lines.append("## Chapter 4: Historical Renewal, Drift, and Red Flags")
+    lines.append("")
+    lines.append("**Management Summary**")
+    lines.append("")
+    lines.extend(
+        [
+            f"- Looking across expired and current history, the corpus contains {historical_count} leaf certificates; {historical_current_count} of them are still valid today.",
+            f"- {repeated_cn_count} Subject CN values recur over time rather than appearing as one-off singletons.",
+            f"- {assessment.normal_reissuance_assets} renewal families look operationally normal: predecessor and successor overlap for fewer than 50 days.",
+            f"- {len(assessment.overlap_current_rows)} names still show long overlap of 50 days or more today.",
+            f"- {len(assessment.overlap_past_rows)} names showed the same long-overlap behaviour in the past, but not anymore in currently valid certificates.",
+            f"- Current non-overlap anomalies are limited: {len(assessment.dn_current_rows)} live Subject DN drift cases, {len(assessment.vendor_current_rows)} live CA-family drift cases, and {len(assessment.san_current_rows)} live SAN-drift cases.",
+            f"- Past-only fixed anomalies were broader: {len(assessment.dn_past_rows)} historical Subject DN drift cases, {len(assessment.vendor_past_rows)} historical CA-family drift cases, and {len(assessment.san_past_rows)} historical SAN-drift cases.",
+        ]
+    )
+    lines.append("")
+    lines.append("This chapter is the historical check on whether the current picture follows a clean renewal pattern. It answers a different question from the current-corpus chapters above: not just what certificates exist now, but how the hostname estate has behaved over time.")
+    lines.append("")
+    lines.append("For this chapter, a renewal family means repeated certificates that keep the same apparent identity over time: the same Subject CN, the same full Subject DN, the same SAN profile, and the same CA family. A normal renewal reissues that same apparent certificate identity with a new key and a new validity span, and predecessor and successor overlap only briefly. In this monograph, anything below 50 days of overlap is treated as normal. Fifty days or more is treated as a red flag. COMODO and Sectigo are treated as one CA family from the outset, so movement between those names is not counted here as CA-family drift.")
+    lines.append("")
+    lines.append("A red flag in this chapter is not the same thing as a breach or a compromise. It means the certificate history diverged from the clean rollover pattern that one would normally expect and therefore deserves closer review.")
+    lines.append("")
+    lines.append("### Current Red-Flag Inventory")
+    lines.append("")
+    if assessment.current_red_flag_rows:
+        lines.extend(
+            md_table(
+                ["Subject CN", "Live Certs", "Current Concern", "Immediate Supporting Context"],
+                [
+                    [
+                        row.subject_cn,
+                        str(row.current_certificate_count),
+                        row.flags,
+                        truncate_text(row.notes, 72),
+                    ]
+                    for row in assessment.current_red_flag_rows[:25]
+                ],
+            )
+        )
+    else:
+        lines.append("No current red flags were found under the configured rules.")
+    lines.append("")
+    lines.append("### Past Red Flags Now Fixed")
+    lines.append("")
+    if assessment.past_red_flag_rows:
+        lines.extend(
+            md_table(
+                ["Subject CN", "Historic Certs", "Historical Concern", "Immediate Supporting Context"],
+                [
+                    [
+                        row.subject_cn,
+                        str(row.certificate_count),
+                        row.flags,
+                        truncate_text(row.notes, 72),
+                    ]
+                    for row in assessment.past_red_flag_rows[:25]
+                ],
+            )
+        )
+    else:
+        lines.append("No past-only red flags were found under the configured rules.")
+    lines.append("")
+    lines.append("### What The Historical Red Flags Mean")
+    lines.append("")
+    lines.append("The two short tables above are screening tables. They answer which names deserve attention now, and which names used to be problematic but no longer look live. The appendices below keep the narrower evidence tables that explain why each name is there.")
+    lines.append("")
+    lines.extend(
+        [
+            f"- **Overlap red flag**: a predecessor and successor inside the same renewal family coexist for 50 days or more. Current cases: {len(assessment.overlap_current_rows)}. Past-only fixed cases: {len(assessment.overlap_past_rows)}.",
+            f"- **Subject DN drift**: the same Subject CN appears under more than one full Subject DN. In plain terms, the headline hostname is being issued under different formal subject identities. Current cases: {len(assessment.dn_current_rows)}. Past-only fixed cases: {len(assessment.dn_past_rows)}.",
+            f"- **CA-family drift**: the same Subject CN appears under more than one CA family, after collapsing COMODO and Sectigo together. Current cases: {len(assessment.vendor_current_rows)}. Past-only fixed cases: {len(assessment.vendor_past_rows)}.",
+            f"- **SAN drift**: the same Subject CN appears with more than one SAN profile. In plain terms, the hostname keeps being bundled with different companion names. Current cases: {len(assessment.san_current_rows)}. Past-only fixed cases: {len(assessment.san_past_rows)}.",
+            f"- **Exact issuer-name changes** inside one CA family also exist: {len(assessment.issuer_rows)} Subject CN values. Those are tracked as context, not as first-order red flags.",
+        ]
+    )
+    lines.append("")
+    lines.append("### Historical Step Changes")
+    lines.append("")
+    lines.extend(
+        [
+            f"- Top issuance start dates: {', '.join(f'{row.start_day} ({row.certificate_count})' for row in assessment.day_rows[:6])}.",
+            f"- Strong step weeks: {', '.join(f'{row.week_start} ({row.certificate_count} vs prior avg {row.prior_eight_week_avg})' for row in assessment.week_rows[:4]) or 'none'}.",
+            "- These bursts matter because they show where certificate behaviour was driven by platform-scale operations rather than one-off manual issuance.",
+        ]
+    )
+    lines.append("")
+    lines.append("## Chapter 5: Naming Architecture")
+    lines.append("")
+    lines.append("**Management Summary**")
+    lines.append("")
+    lines.extend(
+        [
+            f"- Numbered CN families: {len(report['numbered_groups'])}.",
+            f"- Multi-zone SAN sets: {report['multi_zone_hit_count']}.",
+            f"- Frequent naming tokens: {', '.join(f'{token} ({count})' for token, count in report['top_env_tokens'][:8])}.",
+            "- The strongest naming signals come from numbered rails, environment markers, cross-brand labels, and cross-zone SAN composition. `www` is weak evidence either way.",
+        ]
+    )
+    lines.append("")
+    lines.append("What looks arbitrary at first glance is usually the result of different naming pressures colliding. Customer-facing naming wants short memorable brands. Platform naming wants stable operational rails. Delivery naming wants environment labels, release slots, or fleet indices. Migration naming preserves old labels because changing a working name can be risky and expensive.")
+    lines.append("")
+    lines.append("### How To Read The Names")
+    lines.append("")
+    lines.extend(
+        [
+            "- In most of these names, the left-most label tells you the endpoint role, node slot, or environment slice, while the zone on the right tells you which public namespace the service is answering under.",
+            "- Standard delivery shorthand appears throughout the corpus: `dev`, `qa`, `uat`, `sit`, `stg`, `preprod`, and `prod` are ordinary environment markers rather than mysterious product names.",
+            "- `www` is a weak signal both when present and when absent. Its presence often reflects compatibility, redirect history, or old web conventions; its absence does not imply any deeper architectural distinction.",
+            "- In this corpus, `nwg` reads as NatWest Group shorthand. Names like `rbs`, `natwest`, `ulsterbank`, `lombard`, `natwestpayments`, `coutts`, and `nwgwealth` are best read as parallel business or service namespaces within a wider shared estate, not as random unrelated domains.",
+            "- Some short forms remain inferential rather than provable. For example, `nft` clearly behaves like a non-production stage label, but Certificate Transparency alone cannot prove the local expansion used inside the company.",
+        ]
+    )
+    lines.append("")
+    lines.append("### Key Pattern Examples")
+    lines.append("")
+    lines.append("These four boxes are not four isolated hostnames. Each one uses a concrete Subject CN as the evidence anchor for a broader naming methodology that appears elsewhere in the estate as well.")
+    lines.append("")
+    for example in report["examples"]:
+        lines.append(f"#### {example.title}")
+        lines.append("")
+        lines.append(f"- Pattern shown: {example_pattern_label(example.title)}.")
+        lines.append(f"- Concrete example: `{example.subject_cn}`")
+        lines.append(f"- What this proves: {example.why_it_matters}")
+        for point in example.evidence:
+            lines.append(f"- Evidence: {point}")
+        lines.append("")
+    lines.append("### Why These Four Examples")
+    lines.append("")
+    lines.append("Taken together, these four examples explain most of the naming behaviour in the corpus. The first shows platform fleet naming, the second shows environment-and-release naming, the third shows cross-brand namespace splicing and migration residue, and the fourth shows shared-service bridging across several business namespaces.")
+    lines.append("")
+    lines.append("## Chapter 6: DNS Delivery Architecture")
+    lines.append("")
+    lines.append("**Management Summary**")
+    lines.append("")
+    lines.extend(
+        [
+            f"- Most names resolve indirectly: {alias_to_address_count} public names first point to another hostname and only then reach an address, while only {direct_address_count} names resolve straight to an address.",
+            f"- The most common public DNS outcomes are Adobe Campaign in front of AWS load-balancing ({report['dns_stack_counts'].get('Adobe Campaign -> AWS ALB', 0)}), Adobe Campaign in front of AWS CloudFront ({report['dns_stack_counts'].get('Adobe Campaign -> AWS CloudFront', 0)}), and plain AWS CloudFront without an Adobe layer ({report['dns_stack_counts'].get('AWS CloudFront', 0)}).",
+            f"- Smaller but still meaningful subsets behave like managed API fronts or specialist application platforms: Google Apigee ({report['dns_stack_counts'].get('Google Apigee', 0)}) and Pega Cloud on AWS ({report['dns_stack_counts'].get('Pega Cloud -> AWS ALB', 0)}).",
+            f"- Some certificate names do not lead to a live public endpoint today: {nxdomain_count} do not exist in public DNS at all, {dangling_count} still exist only as broken aliases, and {no_data_count} exist in DNS but returned no public A or AAAA address during the scan.",
+        ]
+    )
+    lines.append("")
+    lines.append("DNS is the public routing layer. It does not tell you everything about an application, but it does tell you where a public name lands: directly on an IP, through an alias chain, through a CDN, through an API gateway, or onto a specialist platform.")
+    lines.append("")
+    lines.append("This chapter does not claim to know the full private architecture behind each service. It only claims what the public DNS trail supports. For each DNS SAN name in the certificate corpus, the scanner queried public `CNAME`, `A`, `AAAA`, and `PTR` data. It then summarized that public answer trail with a short label. Those labels are not arbitrary brand names invented by the report; they are compact descriptions of what the public DNS evidence most strongly suggests.")
+    lines.append("")
+    lines.append("One important caution follows from that last bullet: a hostname can remain visible in certificate history even after its public DNS has been removed or partially dismantled. Certificate history and current DNS are related, but they do not move in lockstep.")
+    lines.append("")
+    lines.append("### How The DNS Evidence Is Read")
+    lines.append("")
+    lines.extend(
+        [
+            "- A `CNAME` shows that one public name is really an alias for another public name.",
+            "- The terminal hostname, returned addresses, and reverse-DNS names often reveal platform clues such as `cloudfront.net`, `elb.amazonaws.com`, `apigee.net`, or `campaign.adobe.com`.",
+            "- The report combines the answer shape and those clues into one short description. For example, `Adobe Campaign -> AWS ALB` means the alias chain contains Adobe Campaign naming and the terminal clues point to AWS load-balancing infrastructure.",
+            "- These labels are therefore evidence summaries, not claims of legal ownership or full internal design.",
+        ]
+    )
+    lines.append("")
+    lines.append("### What The Public DNS Names Resolve To")
+    lines.append("")
+    lines.extend(md_table(["Observed DNS Outcome", "Count", "Plain-Language Meaning"], dns_pattern_rows))
+    lines.append("")
+    lines.append("### Why Each DNS Label Was Used")
+    lines.append("")
+    for label, _count in top_dns_patterns[:6]:
+        lines.append(f"- **{label}**: {delivery_pattern_rule(label)}")
+    lines.append("")
+    lines.append("### Platform And DNS Glossary")
+    lines.append("")
+    glossary = ct_dns_utils.provider_explanations()
+    for term in ["Adobe Campaign", "AWS", "AWS ALB", "AWS CloudFront", "Google Apigee", "Pega Cloud", "Microsoft Edge", "Infinite / agency alias", "CNAME", "A record", "AAAA record", "PTR record", "NXDOMAIN"]:
+        lines.append(f"- **{term}**: {glossary[term]}")
+    lines.append("")
+    lines.append("The glossary terms above are the building blocks used in the DNS-outcome table. This is also why the management summary mentions Adobe Campaign, CloudFront, Apigee, and Pega at all: not because brand names are the point, but because those names reveal what kind of public delivery role a hostname is landing on. CloudFront suggests a distribution edge, Apigee suggests managed API exposure, Adobe Campaign suggests a marketing or communications front, and a load balancer suggests traffic distribution to backend services.")
+    lines.append("")
+    lines.append("The next chapter stays with the same names but moves from delivery to control. This chapter asked where public traffic lands. The next one asks which public CA families DNS currently authorizes to issue for those same names.")
+    lines.append("")
+    lines.append("## Chapter 7: DNS Issuance Policy Control (CAA)")
+    lines.append("")
+    lines.append("**Management Summary**")
+    lines.append("")
+    for zone in caa_analysis.configured_domains:
+        zone_rows = ct_caa_analysis.rows_for_zone(caa_analysis, zone)
+        unrestricted_count = sum(1 for row in zone_rows if not row.allowed_ca_families)
+        mismatch_count = sum(1 for row in zone_rows if row.current_policy_mismatch)
+        overlap_count = sum(1 for row in zone_rows if row.current_multi_family_overlap)
+        dominant_policy = ct_caa_analysis.policy_counter(zone_rows).most_common(1)
+        dominant_label = caa_policy_label(dominant_policy[0][0]) if dominant_policy else "none"
+        lines.append(
+            f"- `{zone}`: {len(zone_rows)} names in scope; dominant policy is {dominant_label}; unrestricted names={unrestricted_count}; current policy-mismatch names={mismatch_count}; current multi-family overlap names={overlap_count}."
+        )
+    lines.extend(
+        [
+            f"- Effective CAA discovery paths across all names: {', '.join(f'{caa_source_label(kind)}={count}' for kind, count in caa_analysis.source_kind_counts.most_common())}.",
+            f"- Current names simultaneously covered by more than one live CA family: {len(caa_analysis.multi_family_overlap_names)}.",
+            f"- Current names whose live certificate family does not match today's published CAA policy: {len(caa_analysis.policy_mismatch_names)}.",
+        ]
+    )
+    lines.append("")
+    lines.append("CAA is the DNS control layer for public certificate issuance. It does not validate a certificate after issuance; instead, it tells a public CA which CA families are authorized to issue for a DNS name if any restriction is published at all. If no CAA is published, WebPKI issuance is unrestricted from the DNS-policy point of view.")
+    lines.append("")
+    lines.append("This chapter is the control-plane counterpart to the certificate and DNS chapters. The certificate chapter showed who actually issued. The DNS chapter showed where the names land. The CAA chapter shows which issuers the DNS owner currently allows for those same names.")
+    lines.append("")
+    lines.append("That distinction matters because hosting and issuance are different decisions. A name can land on AWS and still use a Sectigo-family certificate if DNS policy allows it. A name can also resolve through a vendor platform while still inheriting a first-party corporate CAA policy. The point of this chapter is to show where those decisions line up and where they do not.")
+    lines.append("")
+    lines.append("CAA is checked per DNS name requested in the certificate, not per Subject DN and not per organisational story. A Subject CN can therefore shift between different Subject DN values without creating a CAA clash, because CAA ignores organisation fields and looks only at the DNS names being certified.")
+    lines.append("")
+    lines.append("### Why CAA Matters In This Estate")
+    lines.append("")
+    lines.extend(
+        [
+            "- If a name has no CAA, DNS is not constraining which public CA family may issue for it.",
+            "- If a name inherits a broad corporate policy, that usually means the organisation has left normal brand-facing names under a common default.",
+            "- If a name falls under a narrower subtree or alias-derived policy, that is evidence of more deliberate platform or vendor-specific issuance control.",
+            "- If a live certificate family sits outside today's CAA policy, or if the same DNS name is live under two CA families at once, that usually points to migration lag, overlapping rollout, or policy that moved faster than certificate cleanup.",
+        ]
+    )
+    lines.append("")
+    lines.append("### How To Read The CAA Results")
+    lines.append("")
+    lines.extend(md_table(["CAA Discovery Result", "Names", "Meaning"], caa_source_rows(caa_analysis)))
+    lines.append("")
+    lines.append("The key distinction is between ordinary parent inheritance and alias-target-derived policy. Parent inheritance means the leaf name simply relies on a policy published higher in its own DNS tree. Alias-target-derived policy means the effective CAA surfaced through an alias response. In this corpus, that often marks a managed rail or specialist external platform rather than a plain brand-front hostname.")
+    lines.append("")
+    lines.append("In practical terms, most names in this corpus fall into three shapes: inherited corporate policy, alias-driven managed-platform policy, or no CAA at all. That three-way split is more important than the mechanics themselves, because it shows where issuance control is broad, where it is deliberately narrow, and where it is absent.")
+    lines.append("")
+    lines.append("### Policy Regimes By Configured Zone")
+    lines.append("")
+    for zone in caa_analysis.configured_domains:
+        lines.append(f"#### `{zone}`")
+        lines.append("")
+        lines.extend(md_table(["Policy Regime", "Names", "Plain-Language Meaning"], caa_zone_rows[zone]))
+        lines.append("")
+    if secondary_zone:
+        lines.append(f"The contrast between `{primary_zone}` and `{secondary_zone}` is one of the strongest PKI-governance findings in the corpus. `{primary_zone}` is policy-layered and governed, while `{secondary_zone}` is currently CAA-empty in the scanned name set. That does not make `{secondary_zone}` invalid, but it does mean DNS is not constraining public CA choice there.")
+        lines.append("")
+        lines.append(f"That asymmetry matters more than any one record. `{primary_zone}` looks like a namespace where DNS is being used as an issuance-governance tool. `{secondary_zone}` looks like a namespace where issuance choice is still being handled outside DNS policy, or not being constrained at all.")
+        lines.append("")
+    lines.append("### How CAA Changes The Reading Of The Estate")
+    lines.append("")
+    lines.extend(
+        [
+            "- The CAA layer strengthens the earlier certificate-and-DNS thesis rather than overturning it. The same service families that already looked like shared managed rails from naming and DNS often sit under narrower issuance policy as well.",
+            f"- In `{primary_zone}`, the current CAA friction is concentrated rather than diffuse: {caa_concentration_text(caa_analysis, primary_zone)}.",
+            "- Broad corporate default policy remains visible on many ordinary brand-facing names. That supports the earlier reading that not every public hostname was moved onto one tightly managed delivery rail.",
+            "- Narrower or alias-driven CAA policy appears where the DNS evidence already suggested a managed platform, campaign rail, or vendor-mediated service surface.",
+            "- Vendor-style exceptions still exist. Where a name resolves through a specialist external platform and the allowed CA set widens or changes shape, the policy layer supports the earlier vendor-delegation reading rather than contradicting it.",
+            "- The chapter therefore adds a governance gradient to the earlier thesis: some parts of the estate are tightly steered, some inherit a broad default, and some are still policy-empty.",
+        ]
+    )
+    lines.append("")
+    lines.append("### Why The Next Two Tables Matter")
+    lines.append("")
+    lines.extend(
+        [
+            "- The overlap table shows where an old and a new issuance regime are both still live on the same DNS name.",
+            "- The mismatch table shows where today's DNS policy has already moved, but one or more live certificates still reflect the older state.",
+            "- Read them together, not separately. Together they show whether the estate looks diffusely messy or whether the untidy parts cluster in a small transition zone.",
+        ]
+    )
+    lines.append("")
+    lines.append("### Current Multi-Family Overlap")
+    lines.append("")
+    if caa_analysis.multi_family_overlap_names:
+        lines.extend(md_table(["DNS Name", "Zone", "Live CA Families", "Covering Subject CNs"], top_caa_overlap_rows(caa_analysis)))
+    else:
+        lines.append("No current multi-family overlap names were found.")
+    lines.append("")
+    lines.append("These overlap names are operationally important. They show where the same public DNS name is currently covered by more than one live CA family at once. In this corpus, that behavior clusters tightly in a few service families rather than being spread randomly across the estate.")
+    lines.append("")
+    lines.append("### Current Policy Mismatch")
+    lines.append("")
+    if caa_analysis.policy_mismatch_names:
+        lines.extend(md_table(["DNS Name", "Zone", "Live CA Families", "CAA-Allowed Families", "CAA Discovery Result"], top_caa_mismatch_rows(caa_analysis)))
+    else:
+        lines.append("No current policy-mismatch names were found.")
+    lines.append("")
+    lines.append("A current policy mismatch does not automatically prove CA misissuance. CAA only proves what DNS authorizes now. Certificates can remain valid after the DNS-side policy has changed, so the right reading here is current policy lag or migration residue unless the historical issuance-time DNS can also be shown.")
+    lines.append("")
+    lines.append("Taken together, the overlap and mismatch tables support a migration reading more than a disorder reading. If the estate were simply chaotic, the live friction would be spread widely across unrelated names. Instead, it clusters in a small number of service families that were already prominent in the certificate and DNS chapters.")
+    lines.append("")
+    if focus_analysis:
+        lines.append("## Chapter 8: Focused Subject-CN Cohort")
+        lines.append("")
+        lines.append("**Management Summary**")
+        lines.append("")
+        lines.extend(
+            [
+                f"- The focused cohort contains {focus_analysis.provided_subjects_count} analyst-selected Subject CN values. {focus_analysis.historically_seen_subjects_count} are visible somewhere in the historical CT corpus, and {focus_analysis.current_direct_subjects_count} still have direct current certificates.",
+                f"- The current focused cohort is structurally different from the rest of the estate: all {focus_analysis.current_focus_certificate_count} current focused certificates are Sectigo/COMODO-lineage, compared with {counter_text(focus_analysis.rest_current_issuer_families, 3)} in the rest of the corpus.",
+                f"- The focused cohort uses much smaller certificates: median SAN size {focus_analysis.focus_median_san_entries} versus {focus_analysis.rest_median_san_entries}, and {focus_analysis.focus_multi_zone_certificate_count} current multi-zone certificates versus {focus_analysis.rest_multi_zone_certificate_count} outside the cohort.",
+                f"- Revocation churn is much higher inside the focused cohort: {focus_analysis.focus_revoked_current_count} revoked versus {focus_analysis.focus_not_revoked_current_count} not revoked ({focus_analysis.focus_revoked_share}), compared with {focus_analysis.rest_revoked_current_count} versus {focus_analysis.rest_not_revoked_current_count} ({focus_analysis.rest_revoked_share}) outside the cohort.",
+                f"- Cross-basket carrying is limited rather than universal. The count of focused entries that appear today only as SAN passengers is {focus_analysis.current_carried_only_subjects_count}, and the count ever seen as SAN passengers inside non-focused certificates at all is {focus_analysis.historical_non_focus_carried_subjects_count}.",
+                f"- The cohort splits into three naming buckets rather than one uniform style: {focus_analysis.bucket_counts.get('direct_front_door', 0)} front-door direct names, {focus_analysis.bucket_counts.get('platform_matrix_anchor', 0)} platform-anchor matrix names, and {focus_analysis.bucket_counts.get('ambiguous_legacy', 0)} ambiguous or legacy-residue names.",
+            ]
+        )
+        lines.append("")
+        lines.append("This chapter treats the supplied Subject-CN list as an analyst-guided cohort rather than as a neutral statistical sample. The question is not whether these names are the most common names in the estate. The question is why they were memorable enough to be singled out, and whether the certificate and DNS evidence shows that they belong to a different naming and hosting tradition.")
+        lines.append("")
+        lines.append("The short answer is yes, but not because the cohort is perfectly uniform. The cohort is different from the wider estate because it is weighted toward remembered public fronts and remembered platform anchors, not toward the Amazon-heavy operational rail population that dominates the broader corpus.")
+        lines.append("")
+        lines.append("### Focused Cohort Versus The Rest Of The Estate")
+        lines.append("")
+        lines.extend(md_table(["Comparison View", "Focused Cohort", "Rest Of Current Corpus", "Why It Matters"], focus_comparison))
+        lines.append("")
+        lines.append("### Three Buckets Inside The Cohort")
+        lines.append("")
+        lines.extend(md_table(["Bucket", "Count", "Representative Names", "What It Looks Like", "Why This Bucket Exists"], focus_bucket_summary))
+        lines.append("")
+        lines.append("This bucket split is the key to making the cohort intelligible. The memorable names are not all from one naming methodology. Most are direct public fronts. A very small number are platform-anchor certificates with matrix SAN design. The rest are historical leftovers, carried aliases, or opaque labels whose original role is no longer cleanly visible in the current corpus.")
+        lines.append("")
+        lines.append("### Why This Cohort Feels Different")
+        lines.append("")
+        lines.extend(
+            [
+                "- The dominant bucket is the front-door direct bucket. These are small-SAN certificates attached to memorable service, identity, vendor, or brand-like names directly under the branded public zones configured for the scan.",
+                "- The platform-anchor bucket is tiny but important. These names carry large SAN matrices that spell out environment, tenant, service-cell, or monitoring coverage, which is exactly what one would expect from a centrally managed operational platform slice.",
+                "- The ambiguous bucket matters because it explains the leftover rough edges. These names may be historical-only, partly migrated into other certificates, or too opaque to decode confidently from public evidence alone.",
+                "- The public DNS evidence for the current focused Subject CN names is also different. The cohort lands much more often on direct addresses or simple direct AWS clues, while the wider current Subject-CN population is much more dominated by Adobe-managed, Apigee-managed, or NXDOMAIN outcomes.",
+                "- Historical red flags are common in the cohort, but they are mostly past rather than current. That is consistent with a legacy or manually managed public-web slice that has been cleaned up over time rather than with a currently chaotic platform core.",
+            ]
+        )
+        lines.append("")
+        lines.append("Seen this way, the cohort makes sense. It looks like a remembered estate made of two high-visibility extremes: public-facing service fronts that humans remember because customers and staff encounter them directly, and a small number of operational anchor names that humans remember because administrators, testers, or engineers encounter them repeatedly. The ambiguous bucket is the residue between those two poles.")
+        lines.append("")
+        lines.append("### Cross-Basket Carrying And Migration")
+        lines.append("")
+        if focus_analysis.transition_rows:
+            lines.extend(
+                md_table(
+                    ["Subject CN", "Current Basket Status", "Direct/Carried", "Max Direct-To-Carrier Overlap", "Carrier Subjects"],
+                    [
+                        [
+                            detail.subject_cn,
+                            detail.basket_status,
+                            f"{detail.current_direct_certificates}/{detail.current_non_focus_san_carriers + detail.historical_non_focus_san_carriers}",
+                            str(detail.max_direct_to_carrier_overlap_days),
+                            truncate_text(detail.carrier_subjects, 48),
+                        ]
+                        for detail in focus_analysis.transition_rows[:10]
+                    ],
+                )
+            )
+        else:
+            lines.append("No focused names were seen as SAN passengers inside non-focused certificates.")
+        lines.append("")
+        lines.append("This migration table answers a narrower question than the rest of the chapter. It asks whether these names were gradually absorbed into broader certificates from outside the cohort. The answer is: only in a limited number of cases. Some names do show SAN-passenger behavior or historical carrying, but that is not the dominant explanation for why the cohort feels different. The dominant explanation is the bucket split above: many remembered direct fronts, a few large platform anchors, and a band of legacy residue.")
+        lines.append("")
+        lines.append("### Representative Names By Bucket")
+        lines.append("")
+        lines.extend(md_table(["Bucket", "Subject CN", "Observed Role", "Current/Historical Direct", "Why It Helps Explain The Bucket"], focus_representatives))
+        lines.append("")
+        lines.append("These examples are evidence anchors, not the whole population. The direct-front examples show the remembered public surface. The platform-anchor examples show the rare but important matrix certificates. The ambiguous examples show why the cohort cannot be reduced to a single neat story without losing the migration and legacy residue that made these names memorable in the first place.")
+        lines.append("")
+    lines.append(f"## Chapter {synthesis_chapter}: Making The Whole Estate Make Sense")
+    lines.append("")
+    lines.append("**Management Summary**")
+    lines.append("")
+    lines.extend(
+        [
+            "- The certificate, DNS, and CAA layers are not three separate stories. They are three views of the same operating estate.",
+            "- Naming shows role and organisational memory; DNS shows where traffic lands; CAA shows how tightly issuance is governed.",
+            "- Clean public brand names usually sit closest to the customer surface, while dense SAN sets, numbered families, multi-zone certificates, and narrower CAA policy usually expose the shared platform layer beneath them.",
+            "- When the layers disagree, the disagreement usually signals migration or uneven governance maturity rather than a flat contradiction.",
+            "- The overall shape is more consistent with a federated operating model with uneven governance maturity than with random hostname sprawl.",
+        ]
+    )
+    lines.append("")
+    lines.append("The common ground is operational reality. A branded proposition wants recognisable names. A service team wants a stable endpoint namespace. A platform team wants shared rails and repeatable delivery machinery. A hosting team wants routable front doors that can land on cloud distribution, gateways, or workflow platforms. A security or PKI function wants some names tightly governed and other names left broad or delegated. Certificates, DNS, and CAA tell the same estate story from different angles.")
+    lines.append("")
+    lines.append("A useful way to combine the layers is to ask four questions in order. First, what does the name itself look like: a direct front door, a numbered rail, an environment slice, or a bridge across business zones? Second, how broad is the SAN set: is this one visible service or a bundled platform certificate? Third, where does public DNS actually land the name: direct host, CDN edge, API gateway, campaign rail, or specialist platform? Fourth, does DNS issuance policy stay broad, narrow sharply, or disappear entirely?")
+    lines.append("")
+    lines.append("When those answers align, the reading becomes strong. A small-SAN branded name with ordinary inherited policy reads like a direct public front. A dense multi-zone certificate with numbered families, managed DNS landing, and narrower CAA reads like a shared operational rail. A name that lands on AWS but still uses a Sectigo-family certificate shows that hosting choice and CA choice are separate decisions. A name with current overlap and current policy mismatch shows a transition area where the newer issuance model is already in place but the older certificate state has not fully disappeared.")
+    lines.append("")
+    lines.append("This is why the estate can look both tidy and messy at once. It is tidy within each layer, but messy across layers because the layers are solving different problems. The new CAA evidence sharpens that point rather than contradicting it: the managed rail families are not only named and hosted differently, they are often policy-controlled differently as well. The biggest qualification is that governance is uneven. The primary configured zone shows layered issuance control, while another configured zone remains CAA-empty. That is not random chaos, but it is also not uniform control maturity.")
+    lines.append("")
+    lines.append(f"## Chapter {limits_chapter}: Limits, Confidence, and Noise")
+    lines.append("")
+    lines.append("**Management Summary**")
+    lines.append("")
+    lines.extend(
+        [
+            "- High-confidence claims are the ones tied directly to observable certificate fields, DNS answers, trust records, and current CAA policy.",
+            "- Medium-confidence claims are organisational readings drawn from repeated technical patterns.",
+            "- Lower-confidence claims are exact expansions of abbreviations or exact internal ownership boundaries.",
+            "- Some DNS names do not resolve publicly today; that does not invalidate the certificate-side evidence because certificate and DNS timelines are not identical.",
+            "- A current CAA mismatch does not by itself prove historical CA non-compliance, because DNS policy may have changed after issuance.",
+        ]
+    )
+    lines.append("")
+    lines.append("A useful way to read the corpus is to separate signal from noise. Repeated naming schemas are signal. Repeated DNS outcomes are signal. Which public CA family keeps issuing a name is signal. Where CAA is broad, narrow, delegated, or absent is signal. Simple `www` presence or absence is weak evidence either way unless it coincides with stronger differences such as distinct DNS routing, distinct SAN composition, a distinct certificate renewal history, or a distinct issuance-policy shape.")
+    lines.append("")
+    lines.append("## Appendix A: Full Family Catalogue")
+    lines.append("")
+    lines.append("This appendix is a compact family map. It is not the place for full per-certificate evidence; that remains in the detailed inventory appendix at the end.")
+    lines.append("")
+    lines.extend(md_table(["Family Basis", "Certs", "CNs", "Dominant Stack"], family_rows))
+    lines.append("")
+    lines.append("## Appendix B: Historical Red-Flag Detail")
+    lines.append("")
+    lines.append("This appendix keeps the detailed historical evidence inside the monograph so that the reader does not need a second report. Each subsection answers one narrow question. If a column does not help answer that question, it has been removed.")
+    lines.append("")
+    lines.append("In this appendix, a *renewal family* means repeated certificates that keep the same apparent identity over time: the same Subject CN, the same full Subject DN, the same SAN profile, and the same CA family.")
+    lines.append("")
+    lines.append("### B.1 Current Red-Flag Inventory")
+    lines.append("")
+    if assessment.current_red_flag_rows:
+        lines.extend(
+            md_table(
+                ["Subject CN", "Live Certs", "Current Concern", "Supporting Context"],
+                [
+                    [
+                        row.subject_cn,
+                        str(row.current_certificate_count),
+                        row.flags,
+                        truncate_text(row.notes, 84),
+                    ]
+                    for row in assessment.current_red_flag_rows
+                ],
+            )
+        )
+    else:
+        lines.append("No current red flags were found.")
+    lines.append("")
+    lines.append("### B.2 Past Red-Flag Inventory Now Fixed")
+    lines.append("")
+    if assessment.past_red_flag_rows:
+        lines.extend(
+            md_table(
+                ["Subject CN", "Historic Certs", "Historical Concern", "Supporting Context"],
+                [
+                    [
+                        row.subject_cn,
+                        str(row.certificate_count),
+                        row.flags,
+                        truncate_text(row.notes, 84),
+                    ]
+                    for row in assessment.past_red_flag_rows
+                ],
+            )
+        )
+    else:
+        lines.append("No past-only red flags were found.")
+    lines.append("")
+    lines.append("### B.3 Current Overlap Red Flags")
+    lines.append("")
+    if assessment.overlap_current_rows:
+        lines.extend(
+            md_table(
+                ["Subject CN", "Max Overlap Days", "Live Certs", "What The Renewal Family Looks Like"],
+                [
+                    [
+                        row.subject_cn,
+                        str(row.max_overlap_days),
+                        str(row.current_certificate_count),
+                        f"{row.lineage}; {overlap_signal(row.details)}",
+                    ]
+                    for row in assessment.overlap_current_rows
+                ],
+            )
+        )
+    else:
+        lines.append("No current overlap red flags were found.")
+    lines.append("")
+    lines.append("### B.4 Past Overlap Red Flags Now Fixed")
+    lines.append("")
+    if assessment.overlap_past_rows:
+        lines.extend(
+            md_table(
+                ["Subject CN", "Max Overlap Days", "Historic Certs", "What The Renewal Family Looks Like"],
+                [
+                    [
+                        row.subject_cn,
+                        str(row.max_overlap_days),
+                        str(row.asset_variant_count),
+                        f"{row.lineage}; {overlap_signal(row.details)}",
+                    ]
+                    for row in assessment.overlap_past_rows
+                ],
+            )
+        )
+    else:
+        lines.append("No past overlap red flags were found.")
+    lines.append("")
+    lines.append("### B.5 Current Subject DN Drift")
+    lines.append("")
+    if assessment.dn_current_rows:
+        lines.extend(
+            md_table(
+                ["Subject CN", "Distinct Subject DNs", "Live Certs", "Subject DN Samples"],
+                [
+                    [
+                        row.subject_cn,
+                        str(row.distinct_value_count),
+                        str(row.current_certificate_count),
+                        truncate_text(row.details, 92),
+                    ]
+                    for row in assessment.dn_current_rows
+                ],
+            )
+        )
+    else:
+        lines.append("No current Subject DN drift was found.")
+    lines.append("")
+    lines.append("### B.6 Past Subject DN Drift Now Fixed")
+    lines.append("")
+    if assessment.dn_past_rows:
+        lines.extend(
+            md_table(
+                ["Subject CN", "Distinct Subject DNs", "Historic Certs", "Subject DN Samples"],
+                [
+                    [
+                        row.subject_cn,
+                        str(row.distinct_value_count),
+                        str(row.certificate_count),
+                        truncate_text(row.details, 92),
+                    ]
+                    for row in assessment.dn_past_rows
+                ],
+            )
+        )
+    else:
+        lines.append("No past-only Subject DN drift was found.")
+    lines.append("")
+    lines.append("### B.7 Current CA-Family Drift")
+    lines.append("")
+    if assessment.vendor_current_rows:
+        lines.extend(
+            md_table(
+                ["Subject CN", "Distinct CA Families", "Live Certs", "CA Families Seen"],
+                [
+                    [
+                        row.subject_cn,
+                        str(row.distinct_value_count),
+                        str(row.current_certificate_count),
+                        truncate_text(row.details, 92),
+                    ]
+                    for row in assessment.vendor_current_rows
+                ],
+            )
+        )
+    else:
+        lines.append("No current CA-family drift was found.")
+    lines.append("")
+    lines.append("### B.8 Past CA-Family Drift Now Fixed")
+    lines.append("")
+    if assessment.vendor_past_rows:
+        lines.extend(
+            md_table(
+                ["Subject CN", "Distinct CA Families", "Historic Certs", "CA Families Seen"],
+                [
+                    [
+                        row.subject_cn,
+                        str(row.distinct_value_count),
+                        str(row.certificate_count),
+                        truncate_text(row.details, 92),
+                    ]
+                    for row in assessment.vendor_past_rows
+                ],
+            )
+        )
+    else:
+        lines.append("No past-only CA-family drift was found.")
+    lines.append("")
+    lines.append("### B.9 Current SAN Drift")
+    lines.append("")
+    if assessment.san_current_rows:
+        lines.extend(
+            md_table(
+                ["Subject CN", "SAN Profiles", "Live Certs", "Delta Pattern", "Representative Delta"],
+                [
+                    [
+                        row.subject_cn,
+                        str(row.distinct_san_profiles),
+                        str(row.current_certificate_count),
+                        row.delta_pattern,
+                        truncate_text(row.representative_delta, 92),
+                    ]
+                    for row in assessment.san_current_rows
+                ],
+            )
+        )
+    else:
+        lines.append("No current SAN drift was found.")
+    lines.append("")
+    lines.append("### B.10 Past SAN Drift Now Fixed")
+    lines.append("")
+    if assessment.san_past_rows:
+        lines.extend(
+            md_table(
+                ["Subject CN", "SAN Profiles", "Historic Certs", "Delta Pattern", "Representative Delta"],
+                [
+                    [
+                        row.subject_cn,
+                        str(row.distinct_san_profiles),
+                        str(row.certificate_count),
+                        row.delta_pattern,
+                        truncate_text(row.representative_delta, 92),
+                    ]
+                    for row in assessment.san_past_rows
+                ],
+            )
+        )
+    else:
+        lines.append("No past-only SAN drift was found.")
+    lines.append("")
+    lines.append("### B.11 Historic Start Dates")
+    lines.append("")
+    lines.extend(
+        md_table(
+            ["Start Day", "Certificates", "Dominant Driver"],
+            [[row.start_day, str(row.certificate_count), driver_summary(row.top_subjects, row.top_issuers)] for row in assessment.day_rows],
+        )
+    )
+    lines.append("")
+    lines.append("### B.12 Historic Step Weeks")
+    lines.append("")
+    if assessment.week_rows:
+        lines.extend(
+            md_table(
+                ["Week Start", "Certificates", "Prior 8-Week Avg", "Dominant Driver"],
+                [
+                    [
+                        row.week_start,
+                        str(row.certificate_count),
+                        row.prior_eight_week_avg,
+                        driver_summary(row.top_subjects, row.top_issuers),
+                    ]
+                    for row in assessment.week_rows
+                ],
+            )
+        )
+    else:
+        lines.append("No step weeks met the threshold.")
+    lines.append("")
+    lines.append(f"## Appendix {caa_appendix}: CAA Policy Detail")
+    lines.append("")
+    lines.append("This appendix keeps the issuance-policy evidence inside the monograph. It answers a narrower question than the DNS appendix: not where a name lands, but which public CA families DNS currently authorizes to issue for that name.")
+    lines.append("")
+    lines.append("### C.1 CAA Discovery Paths")
+    lines.append("")
+    lines.extend(md_table(["CAA Discovery Result", "Names", "Meaning"], caa_source_rows(caa_analysis)))
+    lines.append("")
+    lines.append("### C.2 Policy Regimes By Configured Zone")
+    lines.append("")
+    for zone in caa_analysis.configured_domains:
+        lines.append(f"#### `{zone}`")
+        lines.append("")
+        lines.extend(md_table(["Policy Regime", "Names", "Plain-Language Meaning"], caa_zone_rows[zone]))
+        lines.append("")
+    lines.append("### C.3 Current Multi-Family Overlap")
+    lines.append("")
+    if caa_analysis.multi_family_overlap_names:
+        lines.extend(md_table(["DNS Name", "Zone", "Live CA Families", "Covering Subject CNs"], top_caa_overlap_rows(caa_analysis, 40)))
+    else:
+        lines.append("No current multi-family overlap names were found.")
+    lines.append("")
+    lines.append("### C.4 Current Policy Mismatch")
+    lines.append("")
+    if caa_analysis.policy_mismatch_names:
+        lines.extend(md_table(["DNS Name", "Zone", "Live CA Families", "CAA-Allowed Families", "CAA Discovery Result"], top_caa_mismatch_rows(caa_analysis, 40)))
+    else:
+        lines.append("No current policy-mismatch names were found.")
+    lines.append("")
+    if focus_analysis:
+        lines.append(f"## Appendix {focus_appendix}: Focused Subject-CN Detail")
+        lines.append("")
+        lines.append("This appendix keeps the complete focused-cohort table inside the monograph, but it now follows the three-bucket taxonomy from Chapter 8. That makes it easier to read the cohort as a set of related naming traditions instead of as one flat mixed list.")
+        lines.append("")
+        appendix_buckets = [
+            ("direct_front_door", "### D.1 Front-Door Direct Names"),
+            ("platform_matrix_anchor", "### D.2 Platform-Anchor Matrix Names"),
+            ("ambiguous_legacy", "### D.3 Ambiguous Or Legacy Residue"),
+        ]
+        for bucket, heading in appendix_buckets:
+            rows = focus_appendix_rows(focus_analysis, bucket)
+            lines.append(heading)
+            lines.append("")
+            lines.append(f"{ct_focus_subjects.taxonomy_bucket_label(bucket)} count: {focus_analysis.bucket_counts.get(bucket, 0)}.")
+            lines.append("")
+            if rows:
+                lines.extend(
+                    md_table(
+                        [
+                            "Subject CN",
+                            "Bucket Rationale",
+                            "Analyst Note",
+                            "Observed Role",
+                            "Direct C/H",
+                            "Carried C/H",
+                            "SANs C/H",
+                            "Current DNS Outcome",
+                            "Current Revocation Mix",
+                            "Current Flags",
+                            "Past Flags",
+                        ],
+                        rows,
+                    )
+                )
+            else:
+                lines.append("No subjects fell into this bucket.")
+            lines.append("")
+    lines.append(f"## Appendix {detailed_inventory_appendix}: Detailed Inventory Appendix")
+    lines.append("")
+    lines.append("The full issuer-first family inventory is reproduced below so that the monograph remains complete rather than merely interpretive.")
+    lines.append("")
+    lines.append(appendix_markdown)
+    args.markdown_output.write_text("\n".join(lines) + "\n", encoding="utf-8")
+
+

What this block is doing

Writes the narrative monograph in Markdown.

+

Flow arrows

Current-state facts, history, CAA, and focused-cohort analysis → render_markdown → the main Markdown monograph.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+
+## render_latex
+
+
def render_latex(
+    args: argparse.Namespace,
+    report: dict[str, object],
+    assessment: ct_lineage_report.HistoricalAssessment,
+    caa_analysis: ct_caa_analysis.CaaAnalysis,
+    focus_analysis: ct_focus_subjects.FocusCohortAnalysis | None,
+) -> None:
+    args.latex_output.parent.mkdir(parents=True, exist_ok=True)
+    hits = report["hits"]
+    groups = report["groups"]
+    purpose_summary = report["purpose_summary"]
+    total_certificates = len(report["classifications"])
+    issuer_trust = report["issuer_trust"]
+    issuer_family_rows = build_issuer_family_rows(report)
+    family_rows = [
+        [
+            compact_family_basis(row["basis"]),
+            str(row["certificates"]),
+            str(row["subjects"]),
+            first_list_item(row["top_stacks"]),
+        ]
+        for row in report["group_digest"]
+    ]
+    dual_items = [item for item in report["classifications"] if item.category == "tls_server_and_client"]
+    dual_issuer_counts = Counter(short_issuer(item.issuer_name) for item in dual_items)
+    server_only_count = purpose_summary.category_counts.get("tls_server_only", 0)
+    dual_count = purpose_summary.category_counts.get("tls_server_and_client", 0)
+    server_only_issuer_families = collapse_issuer_counts_by_family(
+        purpose_summary.issuer_breakdown.get("tls_server_only", {})
+    )
+    historical_count = len(assessment.certificates)
+    historical_current_count = sum(1 for item in assessment.certificates if item.current)
+    repeated_cn_count = historical_repeated_cn_count(assessment)
+    purpose_rows = [
+        (
+            purpose_label(category),
+            str(count),
+            pct(count, total_certificates),
+            purpose_meaning(category),
+        )
+        for category, count in [
+            ("tls_server_only", purpose_summary.category_counts.get("tls_server_only", 0)),
+            ("tls_server_and_client", purpose_summary.category_counts.get("tls_server_and_client", 0)),
+            ("client_auth_only", purpose_summary.category_counts.get("client_auth_only", 0)),
+            ("smime_only", purpose_summary.category_counts.get("smime_only", 0)),
+            ("code_signing_only", purpose_summary.category_counts.get("code_signing_only", 0)),
+            ("mixed_or_other", purpose_summary.category_counts.get("mixed_or_other", 0)),
+            ("no_eku", purpose_summary.category_counts.get("no_eku", 0)),
+        ]
+    ]
+    visible_purpose_rows = [(label, count, share, meaning) for label, count, share, meaning in purpose_rows if count != "0"]
+    dns_class_counts = report["dns_class_counts"]
+    alias_to_address_count = dns_class_counts.get("cname_to_address", 0)
+    direct_address_count = dns_class_counts.get("direct_address", 0)
+    nxdomain_count = dns_class_counts.get("nxdomain", 0)
+    dangling_count = dns_class_counts.get("dangling_cname", 0)
+    no_data_count = dns_class_counts.get("no_data", 0)
+    top_dns_patterns = report["dns_stack_counts"].most_common(8)
+    focus_comparison = focus_comparison_rows(focus_analysis) if focus_analysis else []
+    focus_bucket_summary = focus_bucket_summary_rows(focus_analysis) if focus_analysis else []
+    focus_representatives = focus_representative_rows(focus_analysis) if focus_analysis else []
+    has_focus = focus_analysis is not None
+    caa_zone_rows = {
+        zone: caa_zone_policy_rows(caa_analysis, zone)
+        for zone in caa_analysis.configured_domains
+    }
+    primary_zone = report["domains"][0] if report["domains"] else "configured primary zone"
+    secondary_zone = report["domains"][1] if len(report["domains"]) > 1 else None
+    appendix_pdf_path = args.appendix_pdf_output.resolve().as_posix()
+    lines: list[str] = [
+        r"\documentclass[11pt]{article}",
+        r"\usepackage[a4paper,margin=18mm]{geometry}",
+        r"\usepackage{fontspec}",
+        r"\usepackage[table]{xcolor}",
+        r"\usepackage{microtype}",
+        r"\usepackage{hyperref}",
+        r"\usepackage{xurl}",
+        r"\usepackage{array}",
+        r"\usepackage{booktabs}",
+        r"\usepackage{tabularx}",
+        r"\usepackage{longtable}",
+        r"\usepackage{needspace}",
+        r"\usepackage{enumitem}",
+        r"\usepackage{titlesec}",
+        r"\usepackage[most]{tcolorbox}",
+        r"\usepackage{pdfpages}",
+        r"\defaultfontfeatures{Ligatures=TeX,Scale=MatchLowercase}",
+        r"\definecolor{Ink}{HTML}{17202A}",
+        r"\definecolor{Muted}{HTML}{667085}",
+        r"\definecolor{Line}{HTML}{D0D5DD}",
+        r"\definecolor{Panel}{HTML}{F8FAFC}",
+        r"\definecolor{Accent}{HTML}{0F766E}",
+        r"\hypersetup{colorlinks=true,linkcolor=Accent,urlcolor=Accent,pdfauthor={CertTransparencySearch},pdftitle={CT and DNS Monograph}}",
+        r"\setlength{\parindent}{0pt}",
+        r"\setlength{\parskip}{6pt}",
+        r"\setlength{\emergencystretch}{4em}",
+        r"\setlength{\footskip}{24pt}",
+        r"\setlength{\tabcolsep}{4.2pt}",
+        r"\renewcommand{\arraystretch}{1.12}",
+        r"\raggedbottom",
+        r"\setcounter{tocdepth}{2}",
+        r"\pagestyle{plain}",
+        r"\titleformat{\section}{\sffamily\bfseries\LARGE\color{Ink}\raggedright}{\thesection}{0.8em}{}",
+        r"\titleformat{\subsection}{\sffamily\bfseries\Large\color{Ink}\raggedright}{\thesubsection}{0.8em}{}",
+        r"\titleformat{\subsubsection}{\sffamily\bfseries\normalsize\color{Ink}\raggedright}{\thesubsubsection}{0.8em}{}",
+        r"\tcbset{panel/.style={enhanced,breakable,boxrule=0.55pt,arc=3pt,left=9pt,right=9pt,top=8pt,bottom=8pt,colback=white,colframe=Line}}",
+        r"\newcommand{\SummaryBox}[1]{\begin{tcolorbox}[enhanced,boxrule=0.55pt,arc=3pt,left=9pt,right=9pt,top=8pt,bottom=8pt,colback=Panel,colframe=Line]#1\end{tcolorbox}}",
+        r"\newcommand{\SoftSubsection}[1]{\Needspace{12\baselineskip}\subsection{#1}}",
+        r"\newcommand{\SoftSubsubsection}[1]{\Needspace{10\baselineskip}\subsubsection{#1}}",
+        r"\begin{document}",
+        r"\begin{titlepage}",
+        r"\vspace*{16mm}",
+        r"{\sffamily\bfseries\fontsize{24}{28}\selectfont CT and DNS Monograph\par}",
+        r"\vspace{6pt}",
+        r"{\Large A complete publication built from live Certificate Transparency and public DNS evidence\par}",
+        r"\vspace{18pt}",
+        rf"\textbf{{Generated}}: {latex_escape(report['generated_at_utc'])}\par",
+        rf"\textbf{{Configured search terms file}}: {latex_escape(args.domains_file.name)}\par",
+        r"\vspace{12pt}",
+        r"\SummaryBox{"
+        + rf"\textbf{{Headline}}: {len(hits)} leaf certificates, {len(groups)} CN families, "
+        + rf"{historical_count} historical leaf certificates, "
+        + rf"{len(report['unique_dns_names'])} DNS names, "
+        + rf"{purpose_summary.category_counts.get('tls_server_only', 0)} ordinary public TLS server certificates, "
+        + rf"{purpose_summary.category_counts.get('tls_server_and_client', 0)} certificates from templates that also permit client-certificate use."
+        + r"}",
+        r"\end{titlepage}",
+        r"\begingroup",
+        r"\small",
+        r"\setlength{\parskip}{2pt}",
+        r"\tableofcontents",
+        r"\endgroup",
+        r"\clearpage",
+    ]
+
+    def add_summary(items: list[str]) -> None:
+        lines.append(r"\SummaryBox{\textbf{Management Summary}\begin{itemize}[leftmargin=1.4em]")
+        for item in items:
+            lines.append(rf"\item {latex_escape(item)}")
+        lines.append(r"\end{itemize}}")
+
+    lines.append(r"\section*{Executive Summary}")
+    lines.append(r"\addcontentsline{toc}{section}{Executive Summary}")
+    add_summary(
+        [
+            f"{len(hits)} current leaf certificates are in scope on this run.",
+            f"{len(groups)} CN families reduce the estate into readable naming clusters.",
+            f"{purpose_summary.category_counts.get('tls_server_only', 0)} certificates are ordinary public TLS server certificates, while {purpose_summary.category_counts.get('tls_server_and_client', 0)} come from templates that also permit client-certificate use.",
+            f"{historical_count} historical leaf certificates show how the same names evolved over time.",
+            f"{len(report['unique_dns_names'])} DNS SAN names were scanned live.",
+            f"{caa_analysis.total_names} DNS names were also assessed for effective CAA policy, revealing where issuance is centrally governed, delegated, or left unrestricted.",
+            "The estate is best understood as layers of branding, service naming, platform naming, delivery naming, and issuance-policy control rather than as random clutter.",
+        ]
+    )
+    lines.append(
+        r"This document is designed as a complete publication rather than a brief. The main chapters carry the argument and the appendices carry the detailed evidence."
+    )
+
+    lines.append(r"\section*{Reading Guide}")
+    lines.append(r"\addcontentsline{toc}{section}{Reading Guide}")
+    add_summary(
+        [
+            "Chapter 1 proves the corpus and explains why the numbers can be trusted.",
+            "Chapters 2 and 3 explain what the current certificates are and what they are for.",
+            "Chapter 4 explains the historical lifecycle and splits red flags into current versus fixed-in-the-past.",
+            "Chapters 5 and 6 explain naming and DNS delivery.",
+            "Chapter 7 explains the issuance-policy layer: which public CAs DNS currently authorizes and where DNS imposes no restriction at all.",
+            *(
+                ["Chapter 8 explains the focused Subject-CN cohort and why it behaves differently from the wider estate."]
+                if has_focus
+                else []
+            ),
+            "The next synthesis chapter ties the whole estate back to operational reality.",
+            "The appendices contain the detailed catalogue, the historical red-flag detail, and the full inventory.",
+        ]
+    )
+
+    lines.append(r"\section{Scope, Completeness, and Proof}")
+    add_summary(
+        [
+            f"The first broad crt.sh search returned {', '.join(f'{domain}={count} matching index rows' for domain, count in report['raw_match_counts'].items())}. Those rows are leads, not final certificate count.",
+            f"The scanner was allowed to collect up to {report['cap']} candidate rows per search term. Because the live match counts stayed below that limit, nothing was silently cut off.",
+            f"After downloading and parsing the actual certificate bodies, {report['verification'].unique_leaf_certificates} genuine leaf certificates remained. {report['verification'].non_leaf_filtered} CA-style certificates and {report['verification'].precertificate_poison_filtered} precertificate marker objects were rejected.",
+            f"Certificates missing the searched-for domains in their DNS SANs after full parsing: {report['missing_matching_san']}.",
+        ]
+    )
+    lines.append(
+        r"This chapter answers the first and most important question: whether the report is built on a complete and trustworthy corpus. The scanner now checks the live raw match count before issuing the capped query. If the cap is too low, it fails instead of silently undercounting."
+    )
+    lines.append(
+        r"The first crt.sh row count is intentionally larger than the final certificate count because Certificate Transparency search results are index rows, not de-duplicated certificates. The report therefore reads the binary certificate body itself, removes duplicates, rejects CA certificates and precertificate marker objects, and only then builds the working corpus."
+    )
+    lines.append(
+        r"In other words: this publication is not based on search-result snippets alone. It is based on the parsed X.509 certificate bodies."
+    )
+
+    lines.append(r"\section{The Certificate Corpus}")
+    add_summary(
+        [
+            f"Issuer families by certificate count are {', '.join(f'{name} ({count})' for name, count in report['issuer_family_counts'].most_common())}.",
+            f"Revocation state in plain terms: {report['rev_counts'].get('not_revoked', 0)} certificates are not marked revoked, and {report['rev_counts'].get('revoked', 0)} were later marked invalid by their issuing CA before natural expiry.",
+            "For every current certificate, the main Subject CN hostname also appears literally in the DNS SAN list. The headline name on the certificate is therefore one of the real covered hostnames, not a decorative label.",
+            "All visible issuer families in this corpus are currently trusted by the major public browser and operating-system trust stores for ordinary web server use.",
+        ]
+    )
+    lines.append(
+        r"A certificate corpus can look random when viewed as a flat list. It becomes intelligible once you group it by issuer family, Subject CN construction, validity history, and SAN design. That is why the appendices are arranged as families rather than raw rows."
+    )
+    lines.extend(
+        [
+            r"\subsection{Issuer Trust Table}",
+            r"\begin{longtable}{>{\raggedright\arraybackslash}p{0.40\linewidth} >{\raggedleft\arraybackslash}p{0.12\linewidth} >{\raggedleft\arraybackslash}p{0.12\linewidth} >{\raggedleft\arraybackslash}p{0.18\linewidth}}",
+            r"\toprule",
+            r"Issuer Family & Certs & Variants & Major WebPKI \\",
+            r"\midrule",
+        ]
+    )
+    for row in issuer_family_rows:
+        lines.append(
+            rf"{latex_escape(row['family'])} & {row['certificates']} & {row['variant_count']} & {row['major_webpki']} \\"
+    )
+    lines.extend([r"\bottomrule", r"\end{longtable}"])
+    lines.append(
+        r"\textbf{What WebPKI trust means.} A WebPKI-trusted issuer is a certificate authority trusted by mainstream browser and operating-system trust stores for public TLS. That matters because it tells you these certificates are not part of a private PKI hidden inside one organisation. They are intended to be valid in the public Internet trust model."
+    )
+    lines.append(
+        r"This view should answer one question only: how many publicly trusted issuer families are present in the estate. Exact subordinate issuer names are supporting evidence and remain in the detailed inventory appendix."
+    )
+
+    lines.append(r"\section{Intended Purpose of the Certificates}")
+    add_summary(
+        [
+            f"Certificates whose allowed purpose is ordinary server authentication only: {purpose_summary.category_counts.get('tls_server_only', 0)}.",
+            f"Certificates whose policy allows both server use and client-certificate use: {purpose_summary.category_counts.get('tls_server_and_client', 0)}.",
+            "Certificates dedicated only to client identity, email signing, or code signing: 0.",
+        ]
+    )
+    lines.append(
+        r"This chapter addresses a key ambiguity. A certificate can be technically valid for several uses, and the hostname alone does not settle that question. The corpus was therefore assessed from the X.509 usage fields themselves: EKU and KeyUsage."
+    )
+    lines.append(
+        r"Extended Key Usage tells software what a certificate is allowed to do. In plain terms, this is the difference between a website certificate, a client-identity certificate, an email certificate, and a code-signing certificate."
+    )
+    lines.extend(
+        [
+            r"\subsection{Purpose Map}",
+            r"\begin{longtable}{>{\raggedright\arraybackslash}p{0.24\linewidth} >{\raggedleft\arraybackslash}p{0.10\linewidth} >{\raggedleft\arraybackslash}p{0.10\linewidth} >{\raggedright\arraybackslash}p{0.46\linewidth}}",
+            r"\toprule",
+            r"Usage Class & Certs & Share & Meaning \\",
+            r"\midrule",
+        ]
+    )
+    for label, count, share, meaning in visible_purpose_rows:
+        lines.append(
+            rf"{latex_escape(label)} & {count} & {latex_escape(share)} & {latex_escape(meaning)} \\"
+        )
+    lines.extend([r"\bottomrule", r"\end{longtable}"])
+    lines.append(
+        r"This view should answer only what kind of certificates these are. Zero-count categories are removed here because they add noise without changing the conclusion."
+    )
+    lines.append(
+        r"The basic picture is simple: the corpus is overwhelmingly made of ordinary public TLS server certificates, with a smaller minority whose EKU also permits client-certificate use."
+    )
+    lines.append(
+        r"\textbf{Plain-language explanation of the usage categories.} A TLS server certificate is what a website or API presents to a browser, app, or machine client. A server-and-client certificate is one whose policy allows both server use and client-certificate use. That does not automatically mean it is actually used as a client certificate, but it leaves that door open. Client-auth-only certificates are what you would expect for a user, robot, or agent identity in mutual TLS. S/MIME means email signing or encryption. Code-signing means software signing rather than endpoint security."
+    )
+    lines.append(
+        r"The result is clean. This corpus is entirely TLS-capable. There is no evidence of a separate S/MIME or code-signing estate, and there are no client-auth-only certificates."
+    )
+    lines.extend(
+        [
+            r"\subsection{EKU and KeyUsage Templates}",
+            r"At the template level, the corpus is even simpler than the certificate count suggests. Here, a template simply means a repeated combination of usage fields. Only two EKU combinations appear at all, and one KeyUsage pattern dominates almost completely.",
+            r"\begin{longtable}{>{\raggedright\arraybackslash}p{0.58\linewidth} >{\raggedleft\arraybackslash}p{0.14\linewidth} >{\raggedleft\arraybackslash}p{0.14\linewidth}}",
+            r"\toprule",
+            r"EKU Template & Certs & Share \\",
+            r"\midrule",
+        ]
+    )
+    for template, count in purpose_summary.eku_templates.items():
+        lines.append(rf"{latex_escape(template)} & {count} & {latex_escape(pct(count, total_certificates))} \\")
+    lines.extend([r"\bottomrule", r"\end{longtable}"])
+    lines.extend(
+        [
+            r"\begin{longtable}{>{\raggedright\arraybackslash}p{0.58\linewidth} >{\raggedleft\arraybackslash}p{0.14\linewidth} >{\raggedleft\arraybackslash}p{0.14\linewidth}}",
+            r"\toprule",
+            r"KeyUsage Template & Certs & Share \\",
+            r"\midrule",
+        ]
+    )
+    for template, count in purpose_summary.key_usage_templates.items():
+        lines.append(rf"{latex_escape(template)} & {count} & {latex_escape(pct(count, total_certificates))} \\")
+    lines.extend([r"\bottomrule", r"\end{longtable}"])
+    lines.extend(
+        [
+            r"\subsection{The Majority Pattern: Server-Only Public TLS}",
+            rf"Server-only certificates account for {server_only_count} of {total_certificates} certificates, or {latex_escape(pct(server_only_count, total_certificates))} of the corpus.",
+            rf"Server-only validity starts are split between {latex_escape(', '.join(f'{year} ({count})' for year, count in purpose_summary.validity_start_years.get('tls_server_only', {}).items()))}.",
+            rf"Server-only issuer-family concentration is {latex_escape(', '.join(f'{name} ({count})' for name, count in server_only_issuer_families.most_common()))}.",
+            r"This is the normal public WebPKI server-certificate pattern for websites, APIs, and edge service front doors.",
+            r"This majority group is not background noise. It is the main operational reality visible in the scan: public DNS names covered by publicly trusted endpoint certificates.",
+        ]
+    )
+    lines.extend(
+        [
+            r"\subsection{The Minority Pattern: Dual EKU}",
+            rf"In this corpus, {dual_count} certificates carry both \texttt{{serverAuth}} and \texttt{{clientAuth}} in Extended Key Usage. That is {latex_escape(pct(dual_count, total_certificates))} of the corpus. This means the certificate is \emph{{allowed}} to be used in either role. It does not prove that the certificate is actually being used as a client identity in production.",
+            rf"The dual-EKU group is concentrated in these issuer families: {latex_escape(', '.join(f'{name} ({count})' for name, count in dual_issuer_counts.most_common()))}.",
+            rf"{len(purpose_summary.dual_eku_subject_cns_with_server_only_sibling)} dual-EKU Subject-CN families also have a strict server-only sibling, while {len(purpose_summary.dual_eku_subject_cns_without_server_only_sibling)} currently appear only in the dual-EKU group.",
+            rf"Dual-EKU validity starts are split between {latex_escape(', '.join(f'{year} ({count})' for year, count in purpose_summary.validity_start_years.get('tls_server_and_client', {}).items()))}.",
+            r"The important interpretation point is that these still look like public hostname certificates: DNS-style Subject CN values, DNS SAN lists, and public WebPKI issuers. The better reading is therefore not ``separate client-certificate estate'', but ``server certificates issued from a template that also allowed clientAuth''.",
+            r"\subsection{What Is Not Present}",
+            r"There are no client-auth-only certificates, no S/MIME certificates, no code-signing certificates, no mixed-or-other EKU combinations, and no certificates missing EKU entirely.",
+        ]
+    )
+
+    lines.append(r"\section{Historical Renewal, Drift, and Red Flags}")
+    add_summary(
+        [
+            f"Looking across expired and current history, the corpus contains {historical_count} leaf certificates; {historical_current_count} of them are still valid today.",
+            f"{repeated_cn_count} Subject CN values recur over time rather than appearing as one-off singletons.",
+            f"{assessment.normal_reissuance_assets} renewal families look operationally normal: predecessor and successor overlap for fewer than 50 days.",
+            f"{len(assessment.overlap_current_rows)} names still show long overlap of 50 days or more today.",
+            f"{len(assessment.overlap_past_rows)} names showed the same long-overlap behaviour in the past, but not anymore in currently valid certificates.",
+            f"Current non-overlap anomalies are limited: {len(assessment.dn_current_rows)} live Subject DN drift cases, {len(assessment.vendor_current_rows)} live CA-family drift cases, and {len(assessment.san_current_rows)} live SAN drift cases.",
+            f"Past-only fixed anomalies were broader: {len(assessment.dn_past_rows)} historical Subject DN drift cases, {len(assessment.vendor_past_rows)} historical CA-family drift cases, and {len(assessment.san_past_rows)} historical SAN drift cases.",
+        ]
+    )
+    lines.append(
+        r"This chapter is the historical check on whether the current picture follows a clean renewal pattern. It answers a different question from the current-corpus chapters above: not just what certificates exist now, but how the hostname estate has behaved over time."
+    )
+    lines.append(
+        r"For this chapter, a renewal family means repeated certificates that keep the same apparent identity over time: the same Subject CN, the same full Subject DN, the same SAN profile, and the same CA family. A normal renewal reissues that same apparent certificate identity with a new key and a new validity span, and predecessor and successor overlap only briefly. In this monograph, anything below fifty days of overlap is treated as normal. Fifty days or more is treated as a red flag. COMODO and Sectigo are treated as one CA family from the outset, so movement between those names is not counted here as CA-family drift."
+    )
+    lines.append(
+        r"A red flag in this chapter is not the same thing as a breach or a compromise. It means the certificate history diverged from the clean rollover pattern that one would normally expect and therefore deserves closer review."
+    )
+    lines.extend(
+        [
+            r"\subsection{Current Red-Flag Inventory}",
+        ]
+    )
+    if assessment.current_red_flag_rows:
+        lines.extend(
+            [
+                r"\begin{longtable}{>{\raggedright\arraybackslash}p{0.27\linewidth} >{\raggedleft\arraybackslash}p{0.10\linewidth} >{\raggedright\arraybackslash}p{0.24\linewidth} >{\raggedright\arraybackslash}p{0.29\linewidth}}",
+                r"\toprule",
+                r"Subject CN & Live Certs & Current Concern & Immediate Supporting Context \\",
+                r"\midrule",
+            ]
+        )
+        for row in assessment.current_red_flag_rows[:25]:
+            lines.append(
+                rf"{latex_escape(row.subject_cn)} & {row.current_certificate_count} & {latex_escape(row.flags)} & {latex_escape(truncate_text(row.notes, 72))} \\"
+            )
+        lines.extend([r"\bottomrule", r"\end{longtable}"])
+    else:
+        lines.append(r"No current red flags were found under the configured rules.")
+    lines.append(r"\subsection{Past Red Flags Now Fixed}")
+    if assessment.past_red_flag_rows:
+        lines.extend(
+            [
+                r"\begin{longtable}{>{\raggedright\arraybackslash}p{0.27\linewidth} >{\raggedleft\arraybackslash}p{0.10\linewidth} >{\raggedright\arraybackslash}p{0.24\linewidth} >{\raggedright\arraybackslash}p{0.29\linewidth}}",
+                r"\toprule",
+                r"Subject CN & Historic Certs & Historical Concern & Immediate Supporting Context \\",
+                r"\midrule",
+            ]
+        )
+        for row in assessment.past_red_flag_rows[:25]:
+            lines.append(
+                rf"{latex_escape(row.subject_cn)} & {row.certificate_count} & {latex_escape(row.flags)} & {latex_escape(truncate_text(row.notes, 72))} \\"
+            )
+        lines.extend([r"\bottomrule", r"\end{longtable}"])
+    else:
+        lines.append(r"No past-only red flags were found under the configured rules.")
+    lines.extend(
+        [
+            r"\subsection{What The Historical Red Flags Mean}",
+            r"The two short tables above are screening tables. They answer which names deserve attention now and which names used to be problematic but no longer look live. The appendix below keeps the narrower evidence tables that explain why each name appears here.",
+            rf"Overlap red flags mean predecessor and successor certificates inside the same renewal family coexist for fifty days or more. Current cases: {len(assessment.overlap_current_rows)}. Past-only fixed cases: {len(assessment.overlap_past_rows)}.",
+            rf"Subject-DN drift means the same Subject CN appears under more than one full Subject DN. In plain terms, the headline hostname is being issued under different formal subject identities. Current cases: {len(assessment.dn_current_rows)}. Past-only fixed cases: {len(assessment.dn_past_rows)}.",
+            rf"CA-family drift means the same Subject CN appears under more than one CA family after collapsing COMODO and Sectigo together. Current cases: {len(assessment.vendor_current_rows)}. Past-only fixed cases: {len(assessment.vendor_past_rows)}.",
+            rf"SAN drift means the same Subject CN appears with more than one SAN profile. In plain terms, the hostname keeps being bundled with different companion names. Current cases: {len(assessment.san_current_rows)}. Past-only fixed cases: {len(assessment.san_past_rows)}.",
+            rf"Exact issuer-name changes also exist for {len(assessment.issuer_rows)} Subject CN values, but these are supporting context rather than first-order red flags.",
+            r"\subsection{Historical Step Changes}",
+            rf"Top issuance start dates are {latex_escape(', '.join(f'{row.start_day} ({row.certificate_count})' for row in assessment.day_rows[:6]))}.",
+            rf"Strong step weeks are {latex_escape(', '.join(f'{row.week_start} ({row.certificate_count} vs prior avg {row.prior_eight_week_avg})' for row in assessment.week_rows[:4]) or 'none')}.",
+        ]
+    )
+
+    lines.append(r"\section{Naming Architecture}")
+    add_summary(
+        [
+            f"Numbered CN families: {len(report['numbered_groups'])}.",
+            f"Multi-zone SAN sets: {report['multi_zone_hit_count']}.",
+            f"Frequent naming tokens are {', '.join(f'{token} ({count})' for token, count in report['top_env_tokens'][:8])}.",
+            "The strongest naming signals come from numbered rails, environment markers, cross-brand labels, and cross-zone SAN composition. www is weak evidence either way.",
+        ]
+    )
+    lines.append(
+        r"The naming regime becomes intelligible when read as several superimposed languages: brand language, service language, environment language, platform language, and migration residue."
+    )
+    lines.extend(
+        [
+            r"\subsection{How To Read The Names}",
+            r"\begin{itemize}[leftmargin=1.4em]",
+            r"\item In most of these names, the left-most label tells you the endpoint role, node slot, or environment slice, while the zone on the right tells you which public namespace the service is answering under.",
+            r"\item Standard delivery shorthand appears throughout the corpus: \texttt{dev}, \texttt{qa}, \texttt{uat}, \texttt{sit}, \texttt{stg}, \texttt{preprod}, and \texttt{prod} are ordinary environment markers rather than mysterious product names.",
+            r"\item \texttt{www} is a weak signal both when present and when absent. Its presence often reflects compatibility, redirect history, or old web conventions; its absence does not imply any deeper architectural distinction.",
+            r"\item In this corpus, \texttt{nwg} reads as NatWest Group shorthand. Names like \texttt{rbs}, \texttt{natwest}, \texttt{ulsterbank}, \texttt{lombard}, \texttt{natwestpayments}, \texttt{coutts}, and \texttt{nwgwealth} are best read as parallel business or service namespaces within a wider shared estate, not as random unrelated domains.",
+            r"\item Some short forms remain inferential rather than provable. For example, \texttt{nft} clearly behaves like a non-production stage label, but Certificate Transparency alone cannot prove the local expansion used inside the company.",
+            r"\end{itemize}",
+        ]
+    )
+    lines.append(r"\subsection{Key Pattern Examples}")
+    lines.append(
+        r"These four boxes are not four isolated hostnames. Each one uses a concrete Subject-CN value as the evidence anchor for a broader naming methodology that appears elsewhere in the estate as well."
+    )
+    for example in report["examples"]:
+        lines.append(r"\SummaryBox{")
+        lines.append(rf"\textbf{{{latex_escape(example.title)}}}\par")
+        lines.append(rf"\textbf{{Pattern shown}}: {latex_escape(example_pattern_label(example.title))}\par")
+        lines.append(rf"\textbf{{Concrete example}}: \texttt{{{latex_escape(example.subject_cn)}}}\par")
+        lines.append(rf"\textbf{{What this proves}}: {latex_escape(example.why_it_matters)}\par")
+        lines.append(r"\begin{itemize}[leftmargin=1.4em]")
+        for point in example.evidence:
+            lines.append(rf"\item {latex_escape(point)}")
+        lines.append(r"\end{itemize}}")
+    lines.extend(
+        [
+            r"\subsection{Why These Four Examples}",
+            r"Taken together, these four examples explain most of the naming behaviour in the corpus. The first shows platform fleet naming, the second shows environment-and-release naming, the third shows cross-brand namespace splicing and migration residue, and the fourth shows shared-service bridging across several business namespaces.",
+        ]
+    )
+
+    lines.append(r"\section{DNS Delivery Architecture}")
+    add_summary(
+        [
+            f"Most names resolve by first aliasing to another hostname and then to an address: {alias_to_address_count} public names follow an alias chain, while {direct_address_count} names resolve straight to an address.",
+            f"The most common public DNS outcomes are Adobe Campaign in front of AWS load-balancing ({report['dns_stack_counts'].get('Adobe Campaign -> AWS ALB', 0)}), Adobe Campaign in front of AWS CloudFront ({report['dns_stack_counts'].get('Adobe Campaign -> AWS CloudFront', 0)}), and plain AWS CloudFront without an Adobe layer ({report['dns_stack_counts'].get('AWS CloudFront', 0)}).",
+            f"Smaller but important subsets look like governed API fronts or specialist application platforms: Google Apigee ({report['dns_stack_counts'].get('Google Apigee', 0)}) and Pega Cloud -> AWS ALB ({report['dns_stack_counts'].get('Pega Cloud -> AWS ALB', 0)}).",
+            f"Some certificate names do not lead to a live public endpoint today: {nxdomain_count} do not exist in public DNS at all, {dangling_count} still exist only as broken aliases, and {no_data_count} exist in DNS but returned no public A or AAAA address during the scan.",
+        ]
+    )
+    lines.append(
+        r"DNS is the public routing layer. It does not tell you everything about an application, but it does tell you where a public name lands: directly on an IP, through an alias chain, through a CDN, through an API gateway, or onto a specialist platform."
+    )
+    lines.append(
+        r"This chapter does not claim to know the full private architecture behind each service. It only claims what the public DNS trail supports. For each DNS SAN name in the certificate corpus, the scanner queried public \texttt{CNAME}, \texttt{A}, \texttt{AAAA}, and \texttt{PTR} data. It then summarized that public answer trail with a short label. Those labels are compact descriptions of the public DNS evidence, not arbitrary platform slogans."
+    )
+    lines.append(
+        r"One important caution follows from that last point: a hostname can remain visible in certificate history even after its public DNS has been removed or partially dismantled. Certificate history and current DNS are related, but they do not move in lockstep."
+    )
+    lines.extend(
+        [
+            r"\subsection{How The DNS Evidence Is Read}",
+            r"\begin{itemize}[leftmargin=1.4em]",
+            r"\item A \texttt{CNAME} shows that one public name is really an alias for another public name.",
+            r"\item The terminal hostname, returned addresses, and reverse-DNS names often reveal platform clues such as \texttt{cloudfront.net}, \texttt{elb.amazonaws.com}, \texttt{apigee.net}, or \texttt{campaign.adobe.com}.",
+            r"\item The report combines the answer shape and those clues into one short description. For example, ``Adobe Campaign -> AWS ALB'' means the alias chain contains Adobe Campaign naming and the terminal clues point to AWS load-balancing infrastructure.",
+            r"\item These labels are therefore evidence summaries, not claims of legal ownership or full internal design.",
+            r"\end{itemize}",
+            r"\subsection{What The Public DNS Names Resolve To}",
+            r"\begin{longtable}{>{\raggedright\arraybackslash}p{0.28\linewidth} >{\raggedleft\arraybackslash}p{0.09\linewidth} >{\raggedright\arraybackslash}p{0.51\linewidth}}",
+            r"\toprule",
+            r"Observed DNS Outcome & Count & Plain-Language Meaning \\",
+            r"\midrule",
+        ]
+    )
+    for label, count in top_dns_patterns:
+        lines.append(rf"{latex_escape(label)} & {count} & {latex_escape(delivery_pattern_meaning(label))} \\")
+    lines.extend([r"\bottomrule", r"\end{longtable}"])
+    lines.extend(
+        [
+            r"\subsection{Why Each DNS Label Was Used}",
+            r"\begin{itemize}[leftmargin=1.4em]",
+        ]
+    )
+    for label, _count in top_dns_patterns[:6]:
+        lines.append(rf"\item \textbf{{{latex_escape(label)}}}: {latex_escape(delivery_pattern_rule(label))}")
+    lines.extend(
+        [
+            r"\end{itemize}",
+            r"\subsection{Platform And DNS Glossary}",
+            r"\begin{longtable}{>{\raggedright\arraybackslash}p{0.22\linewidth} >{\raggedright\arraybackslash}p{0.70\linewidth}}",
+            r"\toprule",
+            r"Term & Explanation \\",
+            r"\midrule",
+        ]
+    )
+    glossary = ct_dns_utils.provider_explanations()
+    for term in ["Adobe Campaign", "AWS", "AWS ALB", "AWS CloudFront", "Google Apigee", "Pega Cloud", "Microsoft Edge", "Infinite / agency alias", "CNAME", "A record", "AAAA record", "PTR record", "NXDOMAIN"]:
+        lines.append(rf"{latex_escape(term)} & {latex_escape(glossary[term])} \\")
+    lines.extend([r"\bottomrule", r"\end{longtable}"])
+    lines.append(
+        r"The glossary terms above are the building blocks used in the DNS-outcome table. This is also why the management summary mentions Adobe Campaign, CloudFront, Apigee, and Pega at all: not because brand names are the point, but because those names reveal what kind of public delivery role a hostname is landing on. CloudFront suggests a distribution edge, Apigee suggests managed API exposure, Adobe Campaign suggests a marketing or communications front, and a load balancer suggests traffic distribution to backend services."
+    )
+    lines.append(
+        r"The next chapter stays with the same names but moves from delivery to control. This chapter asked where public traffic lands. The next one asks which public CA families DNS currently authorizes to issue for those same names."
+    )
+
+    lines.append(r"\section{DNS Issuance Policy Control (CAA)}")
+    zone_summary_items: list[str] = []
+    for zone in caa_analysis.configured_domains:
+        zone_rows = ct_caa_analysis.rows_for_zone(caa_analysis, zone)
+        unrestricted_count = sum(1 for row in zone_rows if not row.allowed_ca_families)
+        mismatch_count = sum(1 for row in zone_rows if row.current_policy_mismatch)
+        overlap_count = sum(1 for row in zone_rows if row.current_multi_family_overlap)
+        dominant_policy = ct_caa_analysis.policy_counter(zone_rows).most_common(1)
+        dominant_label = caa_policy_label(dominant_policy[0][0]) if dominant_policy else "none"
+        zone_summary_items.append(
+            f"{zone}: {len(zone_rows)} names in scope; dominant policy is {dominant_label}; unrestricted names={unrestricted_count}; current policy-mismatch names={mismatch_count}; current multi-family overlap names={overlap_count}."
+        )
+    add_summary(
+        zone_summary_items
+        + [
+            f"Effective CAA discovery paths across all names are {', '.join(f'{caa_source_label(kind)}={count}' for kind, count in caa_analysis.source_kind_counts.most_common())}.",
+            f"Current names simultaneously covered by more than one live CA family: {len(caa_analysis.multi_family_overlap_names)}.",
+            f"Current names whose live certificate family does not match today's published CAA policy: {len(caa_analysis.policy_mismatch_names)}.",
+        ]
+    )
+    lines.append(
+        r"CAA is the DNS control layer for public certificate issuance. It does not validate a certificate after issuance; instead, it tells a public CA which CA families are authorized to issue for a DNS name if any restriction is published at all. If no CAA is published, WebPKI issuance is unrestricted from the DNS-policy point of view."
+    )
+    lines.append(
+        r"This chapter is the control-plane counterpart to the certificate and DNS chapters. The certificate chapter showed who actually issued. The DNS chapter showed where the names land. The CAA chapter shows which issuers the DNS owner currently allows for those same names."
+    )
+    lines.append(
+        r"That distinction matters because hosting and issuance are different decisions. A name can land on AWS and still use a Sectigo-family certificate if DNS policy allows it. A name can also resolve through a vendor platform while still inheriting a first-party corporate CAA policy. The point of this chapter is to show where those decisions line up and where they do not."
+    )
+    lines.append(
+        r"CAA is checked per DNS name requested in the certificate, not per Subject DN and not per organisational story. A Subject CN can therefore shift between different Subject DN values without creating a CAA clash, because CAA ignores organisation fields and looks only at the DNS names being certified."
+    )
+    lines.extend(
+        [
+            r"\subsection{Why CAA Matters In This Estate}",
+            r"\begin{itemize}[leftmargin=1.4em]",
+            r"\item If a name has no CAA, DNS is not constraining which public CA family may issue for it.",
+            r"\item If a name inherits a broad corporate policy, that usually means the organisation has left normal brand-facing names under a common default.",
+            r"\item If a name falls under a narrower subtree or alias-derived policy, that is evidence of more deliberate platform or vendor-specific issuance control.",
+            r"\item If a live certificate family sits outside today's CAA policy, or if the same DNS name is live under two CA families at once, that usually points to migration lag, overlapping rollout, or policy that moved faster than certificate cleanup.",
+            r"\end{itemize}",
+        ]
+    )
+    lines.extend(
+        [
+            r"\subsection{How To Read The CAA Results}",
+            r"\begin{longtable}{>{\raggedright\arraybackslash}p{0.24\linewidth} >{\raggedleft\arraybackslash}p{0.10\linewidth} >{\raggedright\arraybackslash}p{0.54\linewidth}}",
+            r"\toprule",
+            r"CAA Discovery Result & Names & Meaning \\",
+            r"\midrule",
+        ]
+    )
+    for label, count, meaning in caa_source_rows(caa_analysis):
+        lines.append(rf"{latex_escape(label)} & {latex_escape(count)} & {latex_escape(meaning)} \\")
+    lines.extend([r"\bottomrule", r"\end{longtable}"])
+    lines.append(
+        r"The key distinction is between ordinary parent inheritance and alias-target-derived policy. Parent inheritance means the leaf name simply relies on a policy published higher in its own DNS tree. Alias-target-derived policy means the effective CAA surfaced through an alias response. In this corpus, that often marks a managed rail or specialist external platform rather than a plain brand-front hostname."
+    )
+    lines.append(
+        r"In practical terms, most names in this corpus fall into three shapes: inherited corporate policy, alias-driven managed-platform policy, or no CAA at all. That three-way split is more important than the mechanics themselves, because it shows where issuance control is broad, where it is deliberately narrow, and where it is absent."
+    )
+    lines.append(r"\subsection{Policy Regimes By Configured Zone}")
+    for zone in caa_analysis.configured_domains:
+        lines.append(rf"\subsubsection{{{latex_escape(zone)}}}")
+        lines.extend(
+            [
+                r"\begin{longtable}{>{\raggedright\arraybackslash}p{0.25\linewidth} >{\raggedleft\arraybackslash}p{0.10\linewidth} >{\raggedright\arraybackslash}p{0.53\linewidth}}",
+                r"\toprule",
+                r"Policy Regime & Names & Plain-Language Meaning \\",
+                r"\midrule",
+            ]
+        )
+        for regime, count, meaning in caa_zone_rows[zone]:
+            lines.append(rf"{latex_escape(regime)} & {latex_escape(count)} & {latex_escape(meaning)} \\")
+        lines.extend([r"\bottomrule", r"\end{longtable}"])
+    if secondary_zone:
+        lines.append(
+            rf"The contrast between \texttt{{{latex_escape(primary_zone)}}} and \texttt{{{latex_escape(secondary_zone)}}} is one of the strongest PKI-governance findings in the corpus. \texttt{{{latex_escape(primary_zone)}}} is policy-layered and governed, while \texttt{{{latex_escape(secondary_zone)}}} is currently CAA-empty in the scanned name set. That does not make \texttt{{{latex_escape(secondary_zone)}}} invalid, but it does mean DNS is not constraining public CA choice there."
+        )
+        lines.append(
+            rf"That asymmetry matters more than any one record. \texttt{{{latex_escape(primary_zone)}}} looks like a namespace where DNS is being used as an issuance-governance tool. \texttt{{{latex_escape(secondary_zone)}}} looks like a namespace where issuance choice is still being handled outside DNS policy, or not being constrained at all."
+        )
+    lines.extend(
+        [
+            r"\subsection{How CAA Changes The Reading Of The Estate}",
+            r"The CAA layer strengthens the earlier certificate-and-DNS thesis rather than overturning it. The same service families that already looked like shared managed rails from naming and DNS often sit under narrower issuance policy as well.",
+            rf"In \texttt{{{latex_escape(primary_zone)}}}, the current CAA friction is concentrated rather than diffuse: {latex_escape(caa_concentration_text(caa_analysis, primary_zone))}.",
+            r"Broad corporate default policy remains visible on many ordinary brand-facing names. That supports the earlier reading that not every public hostname was moved onto one tightly managed delivery rail.",
+            r"Narrower or alias-driven CAA policy appears where the DNS evidence already suggested a managed platform, campaign rail, or vendor-mediated service surface.",
+            r"Vendor-style exceptions still exist. Where a name resolves through a specialist external platform and the allowed CA set widens or changes shape, the policy layer supports the earlier vendor-delegation reading rather than contradicting it.",
+            r"The chapter therefore adds a governance gradient to the earlier thesis: some parts of the estate are tightly steered, some inherit a broad default, and some are still policy-empty.",
+            r"\subsection{Why The Next Two Tables Matter}",
+            r"\begin{itemize}[leftmargin=1.4em]",
+            r"\item The overlap table shows where an old and a new issuance regime are both still live on the same DNS name.",
+            r"\item The mismatch table shows where today's DNS policy has already moved, but one or more live certificates still reflect the older state.",
+            r"\item Read them together, not separately. Together they show whether the estate looks diffusely messy or whether the untidy parts cluster in a small transition zone.",
+            r"\end{itemize}",
+            r"\subsection{Current Multi-Family Overlap}",
+        ]
+    )
+    if caa_analysis.multi_family_overlap_names:
+        append_longtable(
+            lines,
+            r">{\raggedright\arraybackslash}p{0.27\linewidth} >{\raggedright\arraybackslash}p{0.10\linewidth} >{\raggedright\arraybackslash}p{0.16\linewidth} >{\raggedright\arraybackslash}p{0.33\linewidth}",
+            ["DNS Name", "Zone", "Live CA Families", "Covering Subject CNs"],
+            top_caa_overlap_rows(caa_analysis),
+            font="footnotesize",
+            tabcolsep="3.2pt",
+        )
+    else:
+        lines.append(r"No current multi-family overlap names were found.")
+    lines.append(
+        r"These overlap names are operationally important. They show where the same public DNS name is currently covered by more than one live CA family at once. In this corpus, that behavior clusters tightly in a few service families rather than being spread randomly across the estate."
+    )
+    lines.append(r"\subsection{Current Policy Mismatch}")
+    if caa_analysis.policy_mismatch_names:
+        append_longtable(
+            lines,
+            r">{\raggedright\arraybackslash}p{0.24\linewidth} >{\raggedright\arraybackslash}p{0.10\linewidth} >{\raggedright\arraybackslash}p{0.14\linewidth} >{\raggedright\arraybackslash}p{0.16\linewidth} >{\raggedright\arraybackslash}p{0.20\linewidth}",
+            ["DNS Name", "Zone", "Live CA Families", "CAA-Allowed Families", "CAA Discovery Result"],
+            top_caa_mismatch_rows(caa_analysis),
+            font="footnotesize",
+            tabcolsep="3.0pt",
+        )
+    else:
+        lines.append(r"No current policy-mismatch names were found.")
+    lines.append(
+        r"A current policy mismatch does not automatically prove CA misissuance. CAA only proves what DNS authorizes now. Certificates can remain valid after the DNS-side policy has changed, so the right reading here is current policy lag or migration residue unless the historical issuance-time DNS can also be shown."
+    )
+    lines.append(
+        r"Taken together, the overlap and mismatch tables support a migration reading more than a disorder reading. If the estate were simply chaotic, the live friction would be spread widely across unrelated names. Instead, it clusters in a small number of service families that were already prominent in the certificate and DNS chapters."
+    )
+
+    if focus_analysis:
+        lines.append(r"\section{Focused Subject-CN Cohort}")
+        add_summary(
+            [
+                f"The focused cohort contains {focus_analysis.provided_subjects_count} analyst-selected Subject CN values. {focus_analysis.historically_seen_subjects_count} are visible somewhere in the historical CT corpus, and {focus_analysis.current_direct_subjects_count} still have direct current certificates.",
+                f"The current focused cohort is structurally different from the rest of the estate: all {focus_analysis.current_focus_certificate_count} current focused certificates are Sectigo/COMODO-lineage, compared with {counter_text(focus_analysis.rest_current_issuer_families, 3)} in the rest of the corpus.",
+                f"The focused cohort uses much smaller certificates: median SAN size {focus_analysis.focus_median_san_entries} versus {focus_analysis.rest_median_san_entries}, and {focus_analysis.focus_multi_zone_certificate_count} current multi-zone certificates versus {focus_analysis.rest_multi_zone_certificate_count} outside the cohort.",
+                f"Revocation churn is much higher inside the focused cohort: {focus_analysis.focus_revoked_current_count} revoked versus {focus_analysis.focus_not_revoked_current_count} not revoked ({focus_analysis.focus_revoked_share}), compared with {focus_analysis.rest_revoked_current_count} versus {focus_analysis.rest_not_revoked_current_count} ({focus_analysis.rest_revoked_share}) outside the cohort.",
+                f"Cross-basket carrying is limited rather than universal. The count of focused entries that appear today only as SAN passengers is {focus_analysis.current_carried_only_subjects_count}, and the count ever seen as SAN passengers inside non-focused certificates at all is {focus_analysis.historical_non_focus_carried_subjects_count}.",
+                f"The cohort splits into three naming buckets rather than one uniform style: {focus_analysis.bucket_counts.get('direct_front_door', 0)} front-door direct names, {focus_analysis.bucket_counts.get('platform_matrix_anchor', 0)} platform-anchor matrix names, and {focus_analysis.bucket_counts.get('ambiguous_legacy', 0)} ambiguous or legacy-residue names.",
+            ]
+        )
+        lines.append(
+            r"This chapter treats the supplied Subject-CN list as an analyst-guided cohort rather than as a neutral statistical sample. The question is not whether these names are the most common names in the estate. The question is why they were memorable enough to be singled out, and whether the certificate and DNS evidence shows that they belong to a different naming and hosting tradition."
+        )
+        lines.append(
+            r"The short answer is yes, but not because the cohort is perfectly uniform. The cohort is different from the wider estate because it is weighted toward remembered public fronts and remembered platform anchors, not toward the Amazon-heavy operational rail population that dominates the broader corpus."
+        )
+        lines.extend(
+            [
+                r"\subsection{Focused Cohort Versus The Rest Of The Estate}",
+            ]
+        )
+        append_longtable(
+            lines,
+            r">{\raggedright\arraybackslash}p{0.27\linewidth} >{\raggedright\arraybackslash}p{0.16\linewidth} >{\raggedright\arraybackslash}p{0.16\linewidth} >{\raggedright\arraybackslash}p{0.31\linewidth}",
+            ["Comparison View", "Focused Cohort", "Rest Of Current Corpus", "Why It Matters"],
+            focus_comparison,
+            font="footnotesize",
+            tabcolsep="3.2pt",
+        )
+        lines.extend(
+            [
+                r"\subsection{Three Buckets Inside The Cohort}",
+            ]
+        )
+        append_longtable(
+            lines,
+            r">{\raggedright\arraybackslash}p{0.15\linewidth} >{\raggedleft\arraybackslash}p{0.06\linewidth} >{\raggedright\arraybackslash}p{0.19\linewidth} >{\raggedright\arraybackslash}p{0.24\linewidth} >{\raggedright\arraybackslash}p{0.26\linewidth}",
+            ["Bucket", "Count", "Representative Names", "What It Looks Like", "Why This Bucket Exists"],
+            focus_bucket_summary,
+            font="footnotesize",
+            tabcolsep="3.0pt",
+        )
+        lines.extend(
+            [
+                r"This bucket split is the key to making the cohort intelligible. The memorable names are not all from one naming methodology. Most are direct public fronts. A very small number are platform-anchor certificates with matrix SAN design. The rest are historical leftovers, carried aliases, or opaque labels whose original role is no longer cleanly visible in the current corpus.",
+                r"\subsection{Why This Cohort Feels Different}",
+                r"\begin{itemize}[leftmargin=1.4em]",
+                r"\item The dominant bucket is the front-door direct bucket. These are small-SAN certificates attached to memorable service, identity, vendor, or brand-like names directly under the branded public zones configured for the scan.",
+                r"\item The platform-anchor bucket is tiny but important. These names carry large SAN matrices that spell out environment, tenant, service-cell, or monitoring coverage, which is exactly what one would expect from a centrally managed operational platform slice.",
+                r"\item The ambiguous bucket matters because it explains the leftover rough edges. These names may be historical-only, partly migrated into other certificates, or too opaque to decode confidently from public evidence alone.",
+                r"\item The public DNS evidence for the current focused Subject CN names is also different. The cohort lands much more often on direct addresses or simple direct AWS clues, while the wider current Subject-CN population is much more dominated by Adobe-managed, Apigee-managed, or NXDOMAIN outcomes.",
+                r"\item Historical red flags are common in the cohort, but they are mostly past rather than current. That is consistent with a legacy or manually managed public-web slice that has been cleaned up over time rather than with a currently chaotic platform core.",
+                r"\end{itemize}",
+            ]
+        )
+        lines.append(
+            r"Seen this way, the cohort makes sense. It looks like a remembered estate made of two high-visibility extremes: public-facing service fronts that humans remember because customers and staff encounter them directly, and a small number of operational anchor names that humans remember because administrators, testers, or engineers encounter them repeatedly. The ambiguous bucket is the residue between those two poles."
+        )
+        lines.append(r"\subsection{Cross-Basket Carrying And Migration}")
+        if focus_analysis.transition_rows:
+            append_longtable(
+                lines,
+                r">{\raggedright\arraybackslash}p{0.22\linewidth} >{\raggedright\arraybackslash}p{0.19\linewidth} >{\raggedright\arraybackslash}p{0.11\linewidth} >{\raggedleft\arraybackslash}p{0.09\linewidth} >{\raggedright\arraybackslash}p{0.27\linewidth}",
+                ["Subject CN", "Current Basket Status", "Direct / Carried", "Max Overlap Days", "Carrier Subjects"],
+                [
+                    [
+                        detail.subject_cn,
+                        detail.basket_status,
+                        f"{detail.current_direct_certificates}/{detail.current_non_focus_san_carriers + detail.historical_non_focus_san_carriers}",
+                        str(detail.max_direct_to_carrier_overlap_days),
+                        truncate_text(detail.carrier_subjects, 48),
+                    ]
+                    for detail in focus_analysis.transition_rows[:10]
+                ],
+                font="footnotesize",
+                tabcolsep="3.1pt",
+            )
+        else:
+            lines.append(r"No focused names were seen as SAN passengers inside non-focused certificates.")
+        lines.append(
+            r"This migration table answers a narrower question than the rest of the chapter. It asks whether these names were gradually absorbed into broader certificates from outside the cohort. The answer is: only in a limited number of cases. Some names do show SAN-passenger behavior or historical carrying, but that is not the dominant explanation for why the cohort feels different. The dominant explanation is the bucket split above: many remembered direct fronts, a few large platform anchors, and a band of legacy residue."
+        )
+        lines.append(r"\subsection{Representative Names By Bucket}")
+        append_longtable(
+            lines,
+            r">{\raggedright\arraybackslash}p{0.14\linewidth} >{\raggedright\arraybackslash}p{0.17\linewidth} >{\raggedright\arraybackslash}p{0.18\linewidth} >{\raggedright\arraybackslash}p{0.09\linewidth} >{\raggedright\arraybackslash}p{0.30\linewidth}",
+            ["Bucket", "Subject CN", "Observed Role", "Direct C/H", "Why It Helps Explain The Bucket"],
+            focus_representatives,
+            font="footnotesize",
+            tabcolsep="3.0pt",
+        )
+        lines.append(
+            r"These examples are evidence anchors, not the whole population. The direct-front examples show the remembered public surface. The platform-anchor examples show the rare but important matrix certificates. The ambiguous examples show why the cohort cannot be reduced to a single neat story without losing the migration and legacy residue that made these names memorable in the first place."
+        )
+
+    lines.append(r"\Needspace{12\baselineskip}")
+    lines.append(r"\section{Making The Whole Estate Make Sense}")
+    add_summary(
+        [
+            "The certificate, DNS, and CAA layers are not three separate stories. They are three views of the same operating estate.",
+            "Naming shows role and organisational memory; DNS shows where traffic lands; CAA shows how tightly issuance is governed.",
+            "Clean public brand names usually sit closest to the customer surface, while dense SAN sets, numbered families, multi-zone certificates, and narrower CAA policy usually expose the shared platform layer beneath them.",
+            "When the layers disagree, the disagreement usually signals migration or uneven governance maturity rather than a flat contradiction.",
+            "The overall pattern is more consistent with a federated operating model with uneven governance maturity than with random hostname sprawl.",
+        ]
+    )
+    lines.append(
+        r"The common ground is operational reality. A branded proposition wants recognisable names. A service team wants a stable endpoint namespace. A platform team wants shared rails and repeatable delivery machinery. A hosting team wants routable front doors that can land on cloud distribution, gateways, or workflow platforms. A security or PKI function wants some names tightly governed and other names left broad or delegated. Certificates, DNS, and CAA tell the same estate story from different angles."
+    )
+    lines.append(
+        r"A useful way to combine the layers is to ask four questions in order. First, what does the name itself look like: a direct front door, a numbered rail, an environment slice, or a bridge across business zones? Second, how broad is the SAN set: is this one visible service or a bundled platform certificate? Third, where does public DNS actually land the name: direct host, CDN edge, API gateway, campaign rail, or specialist platform? Fourth, does DNS issuance policy stay broad, narrow sharply, or disappear entirely?"
+    )
+    lines.append(
+        r"When those answers align, the reading becomes strong. A small-SAN branded name with ordinary inherited policy reads like a direct public front. A dense multi-zone certificate with numbered families, managed DNS landing, and narrower CAA reads like a shared operational rail. A name that lands on AWS but still uses a Sectigo-family certificate shows that hosting choice and CA choice are separate decisions. A name with current overlap and current policy mismatch shows a transition area where the newer issuance model is already in place but the older certificate state has not fully disappeared."
+    )
+    lines.append(
+        r"This is why the estate can look both tidy and messy at once. It is tidy within each layer, but messy across layers because the layers are solving different problems. The new CAA evidence sharpens that point rather than contradicting it: the managed rail families are not only named and hosted differently, they are often policy-controlled differently as well. The biggest qualification is that governance is uneven. The primary configured zone shows layered issuance control, while another configured zone remains CAA-empty. That is not random chaos, but it is also not uniform control maturity."
+    )
+
+    lines.append(r"\section{Limits, Confidence, and Noise}")
+    add_summary(
+        [
+            "High-confidence claims are tied directly to certificate fields, DNS answers, live trust records, and current CAA policy.",
+            "Medium-confidence claims are organisational readings drawn from repeated technical patterns.",
+            "Lower-confidence claims are exact expansions of abbreviations and exact ownership boundaries inferred from names alone.",
+            "A public NXDOMAIN today does not automatically contradict a valid certificate because DNS and certificate lifecycles move on different clocks.",
+            "A current CAA mismatch does not by itself prove historical CA non-compliance, because DNS policy may have changed after issuance.",
+        ]
+    )
+    lines.append(
+        r"A useful way to read the corpus is to separate signal from noise. Repeated naming schemas are signal. Repeated DNS outcomes are signal. Which public CA family keeps issuing a name is signal. Where CAA is broad, narrow, delegated, or absent is signal. Simple \texttt{www} presence or absence is weak evidence either way unless it coincides with stronger differences such as distinct DNS routing, distinct SAN composition, a distinct certificate renewal history, or a distinct issuance-policy shape."
+    )
+
+    lines.extend(
+        [
+            r"\clearpage",
+            r"\appendix",
+            r"\section{Full Family Catalogue}",
+            r"This appendix is a compact family map. It is not the place for full per-certificate evidence; that remains in the detailed inventory appendix at the end of the monograph.",
+        ]
+    )
+    append_longtable(
+        lines,
+        r">{\raggedright\arraybackslash}p{0.56\linewidth} >{\raggedleft\arraybackslash}p{0.09\linewidth} >{\raggedleft\arraybackslash}p{0.08\linewidth} >{\raggedright\arraybackslash}p{0.18\linewidth}",
+        ["Family Basis", "Certs", "CNs", "Dominant Stack"],
+        family_rows,
+        font="footnotesize",
+        tabcolsep="3.0pt",
+    )
+
+    lines.extend(
+        [
+            r"\section{Historical Red-Flag Detail}",
+            r"This appendix keeps the detailed historical evidence inside the monograph so that the reader does not need a second report. Each subsection answers one narrow question. If a column does not help answer that question, it has been removed.",
+            r"In this appendix, a renewal family means repeated certificates that keep the same apparent identity over time: the same Subject CN, the same full Subject DN, the same SAN profile, and the same CA family.",
+            r"\subsection{Current Red-Flag Inventory}",
+        ]
+    )
+    if assessment.current_red_flag_rows:
+        append_longtable(
+            lines,
+            r">{\raggedright\arraybackslash}p{0.28\linewidth} >{\raggedleft\arraybackslash}p{0.09\linewidth} >{\raggedright\arraybackslash}p{0.24\linewidth} >{\raggedright\arraybackslash}p{0.27\linewidth}",
+            ["Subject CN", "Live Certs", "Current Concern", "Supporting Context"],
+            [
+                [
+                    row.subject_cn,
+                    str(row.current_certificate_count),
+                    row.flags,
+                    truncate_text(row.notes, 84),
+                ]
+                for row in assessment.current_red_flag_rows
+            ],
+            font="footnotesize",
+            tabcolsep="3.0pt",
+        )
+    else:
+        lines.append(r"No current red flags were found.")
+    lines.append(r"\subsection{Past Red-Flag Inventory Now Fixed}")
+    if assessment.past_red_flag_rows:
+        append_longtable(
+            lines,
+            r">{\raggedright\arraybackslash}p{0.28\linewidth} >{\raggedleft\arraybackslash}p{0.09\linewidth} >{\raggedright\arraybackslash}p{0.24\linewidth} >{\raggedright\arraybackslash}p{0.27\linewidth}",
+            ["Subject CN", "Historic Certs", "Historical Concern", "Supporting Context"],
+            [
+                [
+                    row.subject_cn,
+                    str(row.certificate_count),
+                    row.flags,
+                    truncate_text(row.notes, 84),
+                ]
+                for row in assessment.past_red_flag_rows
+            ],
+            font="footnotesize",
+            tabcolsep="3.0pt",
+        )
+    else:
+        lines.append(r"No past-only red flags were found.")
+    lines.append(r"\subsection{Current Overlap Red Flags}")
+    if assessment.overlap_current_rows:
+        append_longtable(
+            lines,
+            r">{\raggedright\arraybackslash}p{0.21\linewidth} >{\raggedleft\arraybackslash}p{0.10\linewidth} >{\raggedleft\arraybackslash}p{0.08\linewidth} >{\raggedright\arraybackslash}p{0.51\linewidth}",
+            ["Subject CN", "Max Overlap Days", "Live Certs", "What The Renewal Family Looks Like"],
+            [
+                [
+                    row.subject_cn,
+                    str(row.max_overlap_days),
+                    str(row.current_certificate_count),
+                    f"{row.lineage}; {overlap_signal(row.details)}",
+                ]
+                for row in assessment.overlap_current_rows
+            ],
+            font="footnotesize",
+            tabcolsep="3.0pt",
+        )
+    else:
+        lines.append(r"No current overlap red flags were found.")
+    lines.append(r"\subsection{Past Overlap Red Flags Now Fixed}")
+    if assessment.overlap_past_rows:
+        append_longtable(
+            lines,
+            r">{\raggedright\arraybackslash}p{0.21\linewidth} >{\raggedleft\arraybackslash}p{0.10\linewidth} >{\raggedleft\arraybackslash}p{0.09\linewidth} >{\raggedright\arraybackslash}p{0.52\linewidth}",
+            ["Subject CN", "Max Overlap Days", "Historic Certs", "What The Renewal Family Looks Like"],
+            [
+                [
+                    row.subject_cn,
+                    str(row.max_overlap_days),
+                    str(row.asset_variant_count),
+                    f"{row.lineage}; {overlap_signal(row.details)}",
+                ]
+                for row in assessment.overlap_past_rows
+            ],
+            font="footnotesize",
+            tabcolsep="3.0pt",
+        )
+    else:
+        lines.append(r"No past overlap red flags were found.")
+    lines.append(r"\subsection{Current Subject-DN Drift}")
+    if assessment.dn_current_rows:
+        append_longtable(
+            lines,
+            r">{\raggedright\arraybackslash}p{0.25\linewidth} >{\raggedleft\arraybackslash}p{0.10\linewidth} >{\raggedleft\arraybackslash}p{0.08\linewidth} >{\raggedright\arraybackslash}p{0.45\linewidth}",
+            ["Subject CN", "Distinct Subject DNs", "Live Certs", "Subject DN Samples"],
+            [
+                [
+                    row.subject_cn,
+                    str(row.distinct_value_count),
+                    str(row.current_certificate_count),
+                    truncate_text(row.details, 92),
+                ]
+                for row in assessment.dn_current_rows
+            ],
+            font="footnotesize",
+            tabcolsep="3.0pt",
+        )
+    else:
+        lines.append(r"No current Subject-DN drift was found.")
+    lines.append(r"\subsection{Past Subject-DN Drift Now Fixed}")
+    if assessment.dn_past_rows:
+        append_longtable(
+            lines,
+            r">{\raggedright\arraybackslash}p{0.25\linewidth} >{\raggedleft\arraybackslash}p{0.10\linewidth} >{\raggedleft\arraybackslash}p{0.09\linewidth} >{\raggedright\arraybackslash}p{0.45\linewidth}",
+            ["Subject CN", "Distinct Subject DNs", "Historic Certs", "Subject DN Samples"],
+            [
+                [
+                    row.subject_cn,
+                    str(row.distinct_value_count),
+                    str(row.certificate_count),
+                    truncate_text(row.details, 92),
+                ]
+                for row in assessment.dn_past_rows
+            ],
+            font="footnotesize",
+            tabcolsep="3.0pt",
+        )
+    else:
+        lines.append(r"No past-only Subject-DN drift was found.")
+    lines.append(r"\subsection{Current CA-Family Drift}")
+    if assessment.vendor_current_rows:
+        append_longtable(
+            lines,
+            r">{\raggedright\arraybackslash}p{0.27\linewidth} >{\raggedleft\arraybackslash}p{0.10\linewidth} >{\raggedleft\arraybackslash}p{0.08\linewidth} >{\raggedright\arraybackslash}p{0.45\linewidth}",
+            ["Subject CN", "Distinct CA Families", "Live Certs", "CA Families Seen"],
+            [
+                [
+                    row.subject_cn,
+                    str(row.distinct_value_count),
+                    str(row.current_certificate_count),
+                    truncate_text(row.details, 92),
+                ]
+                for row in assessment.vendor_current_rows
+            ],
+            font="footnotesize",
+            tabcolsep="3.0pt",
+        )
+    else:
+        lines.append(r"No current CA-family drift was found.")
+    lines.append(r"\subsection{Past CA-Family Drift Now Fixed}")
+    if assessment.vendor_past_rows:
+        append_longtable(
+            lines,
+            r">{\raggedright\arraybackslash}p{0.27\linewidth} >{\raggedleft\arraybackslash}p{0.10\linewidth} >{\raggedleft\arraybackslash}p{0.09\linewidth} >{\raggedright\arraybackslash}p{0.45\linewidth}",
+            ["Subject CN", "Distinct CA Families", "Historic Certs", "CA Families Seen"],
+            [
+                [
+                    row.subject_cn,
+                    str(row.distinct_value_count),
+                    str(row.certificate_count),
+                    truncate_text(row.details, 92),
+                ]
+                for row in assessment.vendor_past_rows
+            ],
+            font="footnotesize",
+            tabcolsep="3.0pt",
+        )
+    else:
+        lines.append(r"No past-only CA-family drift was found.")
+    lines.append(r"\subsection{Current SAN Drift}")
+    if assessment.san_current_rows:
+        append_longtable(
+            lines,
+            r">{\raggedright\arraybackslash}p{0.21\linewidth} >{\raggedleft\arraybackslash}p{0.08\linewidth} >{\raggedleft\arraybackslash}p{0.08\linewidth} >{\raggedright\arraybackslash}p{0.18\linewidth} >{\raggedright\arraybackslash}p{0.35\linewidth}",
+            ["Subject CN", "Profiles", "Live Certs", "Delta Pattern", "Representative Delta"],
+            [
+                [
+                    row.subject_cn,
+                    str(row.distinct_san_profiles),
+                    str(row.current_certificate_count),
+                    row.delta_pattern,
+                    truncate_text(row.representative_delta, 92),
+                ]
+                for row in assessment.san_current_rows
+            ],
+            font="footnotesize",
+            tabcolsep="3.0pt",
+        )
+    else:
+        lines.append(r"No current SAN drift was found.")
+    lines.append(r"\subsection{Past SAN Drift Now Fixed}")
+    if assessment.san_past_rows:
+        append_longtable(
+            lines,
+            r">{\raggedright\arraybackslash}p{0.21\linewidth} >{\raggedleft\arraybackslash}p{0.08\linewidth} >{\raggedleft\arraybackslash}p{0.09\linewidth} >{\raggedright\arraybackslash}p{0.18\linewidth} >{\raggedright\arraybackslash}p{0.35\linewidth}",
+            ["Subject CN", "Profiles", "Historic Certs", "Delta Pattern", "Representative Delta"],
+            [
+                [
+                    row.subject_cn,
+                    str(row.distinct_san_profiles),
+                    str(row.certificate_count),
+                    row.delta_pattern,
+                    truncate_text(row.representative_delta, 92),
+                ]
+                for row in assessment.san_past_rows
+            ],
+            font="footnotesize",
+            tabcolsep="3.0pt",
+        )
+    else:
+        lines.append(r"No past-only SAN drift was found.")
+    lines.append(r"\subsection{Historic Start Dates}")
+    append_longtable(
+        lines,
+        r">{\raggedright\arraybackslash}p{0.16\linewidth} >{\raggedleft\arraybackslash}p{0.10\linewidth} >{\raggedright\arraybackslash}p{0.62\linewidth}",
+        ["Start Day", "Certificates", "Dominant Driver"],
+        [
+            [
+                row.start_day,
+                str(row.certificate_count),
+                driver_summary(row.top_subjects, row.top_issuers),
+            ]
+            for row in assessment.day_rows
+        ],
+        font="footnotesize",
+        tabcolsep="3.0pt",
+    )
+    lines.append(r"\subsection{Historic Step Weeks}")
+    if assessment.week_rows:
+        append_longtable(
+            lines,
+            r">{\raggedright\arraybackslash}p{0.16\linewidth} >{\raggedleft\arraybackslash}p{0.09\linewidth} >{\raggedleft\arraybackslash}p{0.13\linewidth} >{\raggedright\arraybackslash}p{0.52\linewidth}",
+            ["Week Start", "Certs", "Prior 8-Week Avg", "Dominant Driver"],
+            [
+                [
+                    row.week_start,
+                    str(row.certificate_count),
+                    row.prior_eight_week_avg,
+                    driver_summary(row.top_subjects, row.top_issuers),
+                ]
+                for row in assessment.week_rows
+            ],
+            font="footnotesize",
+            tabcolsep="3.0pt",
+        )
+    else:
+        lines.append(r"No step weeks met the threshold.")
+
+    lines.extend(
+        [
+            r"\section{CAA Policy Detail}",
+            r"This appendix keeps the issuance-policy evidence inside the monograph. It answers a narrower question than the DNS appendix: not where a name lands, but which public CA families DNS currently authorizes to issue for that name.",
+            r"\subsection{CAA Discovery Paths}",
+        ]
+    )
+    append_longtable(
+        lines,
+        r">{\raggedright\arraybackslash}p{0.23\linewidth} >{\raggedleft\arraybackslash}p{0.10\linewidth} >{\raggedright\arraybackslash}p{0.55\linewidth}",
+        ["CAA Discovery Result", "Names", "Meaning"],
+        caa_source_rows(caa_analysis),
+        font="footnotesize",
+        tabcolsep="3.0pt",
+    )
+    lines.append(r"\subsection{Policy Regimes By Configured Zone}")
+    for zone in caa_analysis.configured_domains:
+        lines.append(rf"\subsubsection{{{latex_escape(zone)}}}")
+        append_longtable(
+            lines,
+            r">{\raggedright\arraybackslash}p{0.24\linewidth} >{\raggedleft\arraybackslash}p{0.10\linewidth} >{\raggedright\arraybackslash}p{0.56\linewidth}",
+            ["Policy Regime", "Names", "Plain-Language Meaning"],
+            caa_zone_rows[zone],
+            font="footnotesize",
+            tabcolsep="3.0pt",
+        )
+    lines.append(r"\subsection{Current Multi-Family Overlap}")
+    if caa_analysis.multi_family_overlap_names:
+        lines.extend(
+            [
+                r"\begin{longtable}{>{\raggedright\arraybackslash}p{0.29\linewidth} >{\raggedright\arraybackslash}p{0.14\linewidth} >{\raggedright\arraybackslash}p{0.17\linewidth} >{\raggedright\arraybackslash}p{0.28\linewidth}}",
+                r"\toprule",
+                r"DNS Name & Zone & Live CA Families & Covering Subject CNs \\",
+                r"\midrule",
+            ]
+        )
+        for name, zone, families, subjects in top_caa_overlap_rows(caa_analysis, 40):
+            lines.append(rf"{latex_escape(name)} & {latex_escape(zone)} & {latex_escape(families)} & {latex_escape(subjects)} \\")
+        lines.extend([r"\bottomrule", r"\end{longtable}"])
+    else:
+        lines.append(r"No current multi-family overlap names were found.")
+    lines.append(r"\subsection{Current Policy Mismatch}")
+    if caa_analysis.policy_mismatch_names:
+        lines.extend(
+            [
+                r"\begin{longtable}{>{\raggedright\arraybackslash}p{0.27\linewidth} >{\raggedright\arraybackslash}p{0.12\linewidth} >{\raggedright\arraybackslash}p{0.16\linewidth} >{\raggedright\arraybackslash}p{0.18\linewidth} >{\raggedright\arraybackslash}p{0.17\linewidth}}",
+                r"\toprule",
+                r"DNS Name & Zone & Live CA Families & CAA-Allowed Families & CAA Discovery Result \\",
+                r"\midrule",
+            ]
+        )
+        for name, zone, families, allowed, result in top_caa_mismatch_rows(caa_analysis, 40):
+            lines.append(rf"{latex_escape(name)} & {latex_escape(zone)} & {latex_escape(families)} & {latex_escape(allowed)} & {latex_escape(result)} \\")
+        lines.extend([r"\bottomrule", r"\end{longtable}"])
+    else:
+        lines.append(r"No current policy-mismatch names were found.")
+
+    if focus_analysis:
+        lines.extend(
+            [
+                r"\section{Focused Subject-CN Detail}",
+                r"This appendix keeps the complete focused-cohort table inside the monograph, but it now follows the three-bucket taxonomy from Chapter 8. That makes it easier to read the cohort as a set of related naming traditions instead of as one flat mixed list.",
+            ]
+        )
+        appendix_buckets = [
+            ("direct_front_door", r"\subsection{Front-Door Direct Names}"),
+            ("platform_matrix_anchor", r"\subsection{Platform-Anchor Matrix Names}"),
+            ("ambiguous_legacy", r"\subsection{Ambiguous Or Legacy Residue}"),
+        ]
+        for bucket, heading in appendix_buckets:
+            lines.append(heading)
+            lines.append(
+                rf"{latex_escape(ct_focus_subjects.taxonomy_bucket_label(bucket))} count: {focus_analysis.bucket_counts.get(bucket, 0)}."
+            )
+            rows = focus_appendix_rows(focus_analysis, bucket)
+            if rows:
+                lines.extend(
+                    [
+                        r"\begin{longtable}{>{\raggedright\arraybackslash}p{0.16\linewidth} >{\raggedright\arraybackslash}p{0.18\linewidth} >{\raggedright\arraybackslash}p{0.12\linewidth} >{\raggedright\arraybackslash}p{0.15\linewidth} >{\raggedright\arraybackslash}p{0.06\linewidth} >{\raggedright\arraybackslash}p{0.07\linewidth} >{\raggedright\arraybackslash}p{0.07\linewidth} >{\raggedright\arraybackslash}p{0.10\linewidth} >{\raggedright\arraybackslash}p{0.10\linewidth} >{\raggedright\arraybackslash}p{0.07\linewidth} >{\raggedright\arraybackslash}p{0.07\linewidth}}",
+                        r"\toprule",
+                        r"Subject CN & Bucket Rationale & Analyst Note & Observed Role & Direct C/H & Carried C/H & SANs C/H & Current DNS Outcome & Current Revocation Mix & Current Flags & Past Flags \\",
+                        r"\midrule",
+                    ]
+                )
+                for row in rows:
+                    lines.append(
+                        rf"{latex_escape(row[0])} & {latex_escape(row[1])} & {latex_escape(row[2])} & {latex_escape(row[3])} & {latex_escape(row[4])} & {latex_escape(row[5])} & {latex_escape(row[6])} & {latex_escape(row[7])} & {latex_escape(row[8])} & {latex_escape(row[9])} & {latex_escape(row[10])} \\"
+                    )
+                lines.extend([r"\bottomrule", r"\end{longtable}"])
+            else:
+                lines.append(r"No subjects fell into this bucket.")
+
+    lines.extend(
+        [
+            r"\section{Detailed Inventory Appendix}",
+            r"This appendix reproduces the full issuer-first family inventory so that the publication remains complete rather than merely interpretive.",
+            rf"\includepdf[pages=-,pagecommand={{}}]{{{latex_escape(appendix_pdf_path)}}}",
+            r"\end{document}",
+        ]
+    )
+    def soften_heading(line: str) -> str:
+        if line.startswith(r"\subsection{"):
+            return line.replace(r"\subsection{", r"\SoftSubsection{", 1)
+        if line.startswith(r"\subsubsection{"):
+            return line.replace(r"\subsubsection{", r"\SoftSubsubsection{", 1)
+        return line
+
+    args.latex_output.write_text(
+        "\n".join(soften_heading(line) for line in lines) + "\n",
+        encoding="utf-8",
+    )
+
+

What this block is doing

Writes the narrative monograph in LaTeX.

+

Flow arrows

Current-state facts, history, CAA, and focused-cohort analysis. → render_latex → Produces the main LaTeX monograph source.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## main + + + + + + +
+
def main() -> int:
    """Top-level CLI entrypoint: run the full monograph build pipeline.

    Order matters: the master report feeds the CAA and focused-cohort
    analyses, and all analyses must exist before any rendering starts.
    Returns 0 on success; failures propagate as exceptions from the
    underlying stages.
    """
    # Local import instead of the original __import__("sys") hack; keeps the
    # fix self-contained even if the module header does not import sys.
    import sys

    args = parse_args()
    report = ct_master_report.summarize_for_report(args)
    assessment = ct_lineage_report.build_assessment(build_history_args(args))
    caa_analysis = ct_caa_analysis.build_analysis(
        report["hits"],
        report["domains"],
        args.caa_cache_dir,
        args.caa_cache_ttl_seconds,
    )
    focus_subjects = ct_focus_subjects.load_focus_subjects(args.focus_subjects_file)
    focus_analysis = ct_focus_subjects.build_analysis(
        focus_subjects,
        report,
        assessment,
        args.dns_cache_dir,
        args.dns_cache_ttl_seconds,
    )
    # Render order: appendix PDF first (the LaTeX monograph embeds it),
    # then the two primary output formats.
    render_appendix_inventory(args, report)
    render_markdown(args, report, assessment, caa_analysis, focus_analysis)
    render_latex(args, report, assessment, caa_analysis, focus_analysis)
    if not args.skip_pdf:
        ct_scan.compile_latex_to_pdf(args.latex_output, args.pdf_output, args.pdf_engine)
    if not args.quiet:
        pdf_note = "" if args.skip_pdf else f" pdf={args.pdf_output}"
        print(
            f"[report] markdown={args.markdown_output} latex={args.latex_output}{pdf_note}",
            file=sys.stderr,
        )
    return 0
+
+

What this block is doing

The top-level command-line entrypoint for the complete monograph build.

+

Flow arrows

CLI arguments from the operator. → main → Runs the full publication pipeline from raw analytics to finished PDF.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ diff --git a/teachingNoobs/ct_scan.md b/teachingNoobs/ct_scan.md new file mode 100644 index 0000000..62950aa --- /dev/null +++ b/teachingNoobs/ct_scan.md @@ -0,0 +1,2168 @@ +# ct_scan.py + +Source file: [`ct_scan.py`](../ct_scan.py) + +Core Certificate Transparency scanner. This file talks to crt.sh's public database, downloads the real certificate bytes, verifies that they are real leaf certificates, groups them into readable families, and can render the full inventory appendix. + +Main flow in one line: `domains file -> raw CT query -> parsed leaf certificates -> CN families -> issuer trust -> appendix reports` + +How to read this page: + +- left side: the actual source code block +- right side: a plain-English explanation for a beginner +- read from top to bottom because later blocks depend on earlier ones + +## Module setup + + + + + + +
+
#!/usr/bin/env python3
"""Core Certificate Transparency scanner.

Queries the public crt.sh ``certwatch`` PostgreSQL database for
certificates matching configured domains and defines the shared SQL,
constants, and record types used by the rest of the pipeline.
"""

from __future__ import annotations

import argparse
import base64
import hashlib
import json
import re
import shutil
import subprocess
import sys
import time
from collections import Counter, defaultdict
from dataclasses import dataclass, field
from datetime import UTC, datetime
from pathlib import Path
from typing import Any

import psycopg
from cryptography import x509
from cryptography.x509 import general_name
from cryptography.x509.oid import ExtensionOID
from cryptography.x509.oid import NameOID
from psycopg.rows import dict_row


# Main inventory query. The inner subquery pre-filters identity rows by
# full-text domain match plus an ILIKE name pattern, capped at
# %(max_candidates)s rows; GROUP BY sub.certificate then collapses those
# identity rows to one row per distinct certificate blob. The outer SELECT
# joins issuer metadata (ca), lifecycle state (certificate_lifecycle), the
# most recent matching CRL revocation entry (LEFT JOIN LATERAL ... LIMIT 1,
# newest check date first), and a per-CA summary of CRLs that are
# error-free and not yet past next_update. The final WHERE keeps only
# certificates that are currently inside their validity window and whose
# lifecycle type is 'Certificate' (i.e. not a precertificate row).
QUERY_SQL = """
WITH ci AS (
    SELECT
        min(sub.certificate_id) AS id,
        min(sub.issuer_ca_id) AS issuer_ca_id,
        x509_commonName(sub.certificate) AS common_name,
        x509_subjectName(sub.certificate) AS subject_dn,
        x509_notBefore(sub.certificate) AS not_before,
        x509_notAfter(sub.certificate) AS not_after,
        encode(x509_serialNumber(sub.certificate), 'hex') AS serial_number,
        sub.certificate AS certificate
    FROM (
        SELECT cai.*
        FROM certificate_and_identities cai
        WHERE plainto_tsquery('certwatch', %(domain)s) @@ identities(cai.certificate)
          AND cai.name_value ILIKE %(name_pattern)s ESCAPE '\\'
        LIMIT %(max_candidates)s
    ) sub
    GROUP BY sub.certificate
)
SELECT
    ci.id,
    ci.issuer_ca_id,
    ca.name AS issuer_name,
    ci.common_name,
    ci.subject_dn,
    ci.not_before,
    ci.not_after,
    cl.first_seen,
    ci.serial_number,
    coalesce(cl.revoked, 0) AS revoked_count,
    rev.revocation_date,
    rev.reason_code,
    rev.last_seen_check_date,
    crl_state.active_crl_count,
    crl_state.last_checked AS crl_last_checked,
    ci.certificate
FROM ci
JOIN ca ON ca.id = ci.issuer_ca_id
JOIN certificate_lifecycle cl ON cl.certificate_id = ci.id
LEFT JOIN LATERAL (
    SELECT
        cr.revocation_date,
        cr.reason_code,
        cr.last_seen_check_date
    FROM crl_revoked cr
    WHERE cr.ca_id = ci.issuer_ca_id
      AND cr.serial_number = decode(ci.serial_number, 'hex')
    ORDER BY cr.last_seen_check_date DESC NULLS LAST
    LIMIT 1
) rev ON TRUE
LEFT JOIN LATERAL (
    SELECT
        count(*) FILTER (
            WHERE crl.error_message IS NULL
              AND crl.next_update > now() AT TIME ZONE 'UTC'
        ) AS active_crl_count,
        max(crl.last_checked) AS last_checked
    FROM crl
    WHERE crl.ca_id = ci.issuer_ca_id
) crl_state ON TRUE
WHERE ci.not_before <= now() AT TIME ZONE 'UTC'
  AND ci.not_after >= now() AT TIME ZONE 'UTC'
  AND cl.certificate_type = 'Certificate'
ORDER BY cl.first_seen DESC NULLS LAST, ci.id DESC;
"""


# Companion count: how many raw identity rows match the same domain and
# name-pattern filter, before the per-certificate grouping above. Useful
# for reporting how much the LIMIT/grouping reduced the candidate set.
RAW_MATCH_COUNT_SQL = """
SELECT count(*)
FROM certificate_and_identities cai
WHERE plainto_tsquery('certwatch', %(domain)s) @@ identities(cai.certificate)
  AND cai.name_value ILIKE %(name_pattern)s ESCAPE '\\'
"""


# CRLReason codes from RFC 5280 section 5.3.1. Value 7 is unused by the
# RFC, which is why it is absent here. Value 0 ("unspecified") is also
# omitted — presumably consumers treat an unmapped code as unspecified;
# confirm at the lookup site (outside this excerpt).
REVOCATION_REASONS = {
    1: "keyCompromise",
    2: "cACompromise",
    3: "affiliationChanged",
    4: "superseded",
    5: "cessationOfOperation",
    6: "certificateHold",
    8: "removeFromCRL",
    9: "privilegeWithdrawn",
    10: "aACompromise",
}


# OID of the CT precertificate poison extension (RFC 6962). Presumably used
# downstream to distinguish precertificates from real leaf certificates —
# the constant's consumers are outside this excerpt.
PRECERT_POISON_OID = x509.ObjectIdentifier("1.3.6.1.4.1.11129.2.4.3")
+
+

What this block is doing

Imports, SQL, constants, and shared data shapes for the core CT scanner.

+

Flow arrows

Nothing yet; this is the starting point. → Module setup → `connect`, `query_domain`, `build_hits`, and the report renderers use these shared definitions.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## DatabaseRecord + + + + + + +
+
@dataclass
class DatabaseRecord:
    """One raw crt.sh certwatch row for one search domain, before local cleanup.

    Produced by `row_to_record` (live query) and `record_from_cache_payload`
    (disk cache); consumed by `build_hits` and `record_to_cache_payload`.
    """

    domain: str  # the search term that matched this row
    certificate_id: int  # crt.sh certificate id
    issuer_ca_id: int  # crt.sh id of the issuing CA
    issuer_name: str
    common_name: str | None  # subject CN as stored by crt.sh, if any
    subject_dn: str | None
    not_before: datetime
    not_after: datetime
    first_seen: datetime | None  # first CT sighting recorded by crt.sh
    serial_number: str  # hex serial as returned by the query
    revoked_count: int  # > 0 means a matching CRL entry exists
    revocation_date: datetime | None
    reason_code: int | None  # CRLReason code; see REVOCATION_REASONS
    last_seen_check_date: datetime | None  # latest CRL check that listed the serial
    active_crl_count: int  # fresh, error-free CRLs known for the issuer
    crl_last_checked: datetime | None
    certificate_der: bytes  # raw DER certificate body
+
+

What this block is doing

A raw row as it comes back from the crt.sh database before local cleanup.

+

Flow arrows

Earlier blocks or operator input feed this block. → DatabaseRecord → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## CertificateHit
+
@dataclass
class CertificateHit:
    """One unique leaf certificate, merged across all rows that referenced it.

    Built by `build_hits`, keyed by the SHA-256 of the DER bytes; the
    set-valued fields accumulate every domain/row sighting of the same cert.
    """

    fingerprint_sha256: str  # hex SHA-256 of the DER; dedup key
    subject_cn: str  # "-" when no CN could be determined
    validity_not_before: datetime
    validity_not_after: datetime
    san_entries: list[str]  # "KIND:value" strings, case-insensitively sorted
    revocation_status: str  # "revoked" | "not_revoked" | "unknown"
    revocation_date: datetime | None
    revocation_reason: str | None
    revocation_note: str | None  # e.g. "no fresh crt.sh CRL data"
    crtsh_crl_timestamp: datetime | None  # CRL-check time backing the status
    matched_domains: set[str] = field(default_factory=set)
    first_seen: datetime | None = None  # earliest first_seen across merged rows
    crtsh_certificate_ids: set[int] = field(default_factory=set)
    serial_numbers: set[str] = field(default_factory=set)
    issuer_names: set[str] = field(default_factory=set)
    issuer_ca_ids: set[int] = field(default_factory=set)
+
+

What this block is doing

The cleaned working object used by the rest of the analytics pipeline.

+

Flow arrows

Earlier blocks or operator input feed this block. → CertificateHit → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## VerificationStats
+
@dataclass
class VerificationStats:
    """Row-accounting counters proving how many inputs were kept or rejected."""

    input_rows: int = 0  # raw rows fed into build_hits
    unique_leaf_certificates: int = 0  # distinct fingerprints kept
    non_leaf_filtered: int = 0  # dropped: CA basic constraints or keyCertSign
    precertificate_poison_filtered: int = 0  # dropped: CT precertificates
+
+

What this block is doing

A tiny running counter that proves how many rows were kept or rejected.

+

Flow arrows

Earlier blocks or operator input feed this block. → VerificationStats → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## CertificateGroup
+
@dataclass
class CertificateGroup:
    """One family of related certificate hits after the grouping logic runs.

    NOTE(review): the grouping stage that fills this is outside this chunk;
    field notes below are partly inferred from names — confirm against it.
    """

    group_id: str
    group_type: str
    member_indices: list[int]  # indices into the ordered hit list
    member_count: int
    distinct_subject_cn_count: int
    distinct_exact_content_count: int
    numbered_cn_patterns: set[str]  # CN shapes with digit runs collapsed to '#'
    matched_domains: set[str]
    subject_cns: set[str]
    first_seen_min: datetime | None
    first_seen_max: datetime | None
    valid_from_min: datetime
    valid_to_max: datetime
    revocation_counts: Counter  # presumably revocation status -> member count
+
+

What this block is doing

One readable family of related certificates after grouping logic runs.

+

Flow arrows

Earlier blocks or operator input feed this block. → CertificateGroup → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## ScanStats
+
@dataclass
class ScanStats:
    """Top-level summary numbers embedded in the generated reports."""

    generated_at_utc: str  # ISO-8601 "Z" timestamp string
    configured_domains: list[str]
    unique_leaf_certificates: int
    groups_total: int
    groups_multi_member: int
    groups_singleton: int
    groups_by_type: dict[str, int]
    verification: VerificationStats  # kept/filtered row accounting
+
+

What this block is doing

Top-level summary numbers used in reports.

+

Flow arrows

Earlier blocks or operator input feed this block. → ScanStats → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## IssuerTrustInfo
+
@dataclass
class IssuerTrustInfo:
    """Public-trust summary for one issuer family.

    NOTE(review): populated by issuer-trust lookups outside this chunk;
    field semantics are inferred from names — confirm against that code.
    """

    issuer_name: str
    issuer_ca_ids: set[int]  # all crt.sh CA ids sharing this issuer name
    server_auth_contexts: set[str]
    major_webpki: bool  # presumably: trusted by the major web-PKI root programs
+
+

What this block is doing

Stores the public-trust picture for one issuer family.

+

Flow arrows

Earlier blocks or operator input feed this block. → IssuerTrustInfo → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## load_domains
+
def load_domains(path: Path) -> list[str]:
    """Read the operator's domain list: one term per line, '#' comments allowed.

    Entries are lowercased, a leading "*." wildcard marker is stripped, and
    duplicates are removed. Raises ValueError when nothing usable remains.
    """
    cleaned: set[str] = set()
    for raw_line in path.read_text(encoding="utf-8").splitlines():
        entry = raw_line.strip().lower()
        if not entry or entry.startswith("#"):
            continue
        cleaned.add(entry.removeprefix("*."))
    if not cleaned:
        raise ValueError(f"No domains found in {path}")
    return sorted(cleaned)
+
+

What this block is doing

This block loads data from disk, cache, or an earlier stage so later code can work with it.

+

Flow arrows

Operator's local config file. → load_domains → `query_domain` and the higher-level loaders use this cleaned domain list.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## escape_like + + + + + + +
+
def escape_like(value: str) -> str:
    """Escape SQL LIKE/ILIKE metacharacters so *value* matches literally.

    Backslash is doubled first so the later escapes are not re-escaped;
    pairs with the ESCAPE '\\' clause in the query SQL.
    """
    escaped = value.replace("\\", "\\\\")
    escaped = escaped.replace("%", "\\%")
    return escaped.replace("_", "\\_")
+
+

What this block is doing

This function is one of the building blocks inside `ct_scan.py`. It exists so the file can do one narrow job at a time instead of one giant unreadable routine.

+

Flow arrows

Earlier blocks or operator input feed this block. → escape_like → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## utc_iso + + + + + + +
+
def utc_iso(value: datetime | None) -> str:
+    if value is None:
+        return "n/a"
+    if value.tzinfo is None:
+        value = value.replace(tzinfo=UTC)
+    else:
+        value = value.astimezone(UTC)
+    return value.isoformat(timespec="seconds").replace("+00:00", "Z")
+
+

What this block is doing

This is a small helper that keeps the larger analytical code cleaner and easier to reuse.

+

Flow arrows

Earlier blocks or operator input feed this block. → utc_iso → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## serialize_datetime + + + + + + +
+
def serialize_datetime(value: datetime | None) -> str | None:
    """JSON-friendly variant of utc_iso: None stays None instead of "n/a"."""
    if value is None:
        return None
    return utc_iso(value)
+
+

What this block is doing

This function is one of the building blocks inside `ct_scan.py`. It exists so the file can do one narrow job at a time instead of one giant unreadable routine.

+

Flow arrows

Earlier blocks or operator input feed this block. → serialize_datetime → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## parse_datetime + + + + + + +
+
def parse_datetime(value: str | None) -> datetime | None:
+    if value is None:
+        return None
+    return datetime.fromisoformat(value.replace("Z", "+00:00")).astimezone(UTC).replace(tzinfo=None)
+
+

What this block is doing

This function is one of the building blocks inside `ct_scan.py`. It exists so the file can do one narrow job at a time instead of one giant unreadable routine.

+

Flow arrows

Earlier blocks or operator input feed this block. → parse_datetime → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## cache_path + + + + + + +
+
def cache_path(cache_dir: Path, domain: str) -> Path:
    """Map a domain to its JSON cache file, sanitizing unsafe filename chars.

    Anything that is not alphanumeric or one of "-._" becomes "_".
    """
    keep = "-._"
    safe_chars = [ch if ch.isalnum() or ch in keep else "_" for ch in domain]
    return cache_dir / ("".join(safe_chars) + ".json")
+
+

What this block is doing

This function is one of the building blocks inside `ct_scan.py`. It exists so the file can do one narrow job at a time instead of one giant unreadable routine.

+

Flow arrows

Earlier blocks or operator input feed this block. → cache_path → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## record_to_cache_payload + + + + + + +
+
def record_to_cache_payload(record: DatabaseRecord) -> dict[str, Any]:
    """Serialize one DatabaseRecord into a JSON-safe dict for the disk cache.

    Datetimes become ISO strings (or None) and the DER body is base64-encoded;
    record_from_cache_payload is the exact inverse.
    """
    ser = serialize_datetime
    der_b64 = base64.b64encode(record.certificate_der).decode("ascii")
    return {
        "domain": record.domain,
        "certificate_id": record.certificate_id,
        "issuer_ca_id": record.issuer_ca_id,
        "issuer_name": record.issuer_name,
        "common_name": record.common_name,
        "subject_dn": record.subject_dn,
        "not_before": ser(record.not_before),
        "not_after": ser(record.not_after),
        "first_seen": ser(record.first_seen),
        "serial_number": record.serial_number,
        "revoked_count": record.revoked_count,
        "revocation_date": ser(record.revocation_date),
        "reason_code": record.reason_code,
        "last_seen_check_date": ser(record.last_seen_check_date),
        "active_crl_count": record.active_crl_count,
        "crl_last_checked": ser(record.crl_last_checked),
        "certificate_der_b64": der_b64,
    }
+
+

What this block is doing

This function is one of the building blocks inside `ct_scan.py`. It exists so the file can do one narrow job at a time instead of one giant unreadable routine.

+

Flow arrows

Earlier blocks or operator input feed this block. → record_to_cache_payload → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## record_from_cache_payload + + + + + + +
+
def record_from_cache_payload(payload: dict[str, Any]) -> DatabaseRecord:
    """Rebuild a DatabaseRecord from its cache dict (inverse of record_to_cache_payload)."""
    dt = parse_datetime
    return DatabaseRecord(
        domain=payload["domain"],
        certificate_id=int(payload["certificate_id"]),
        issuer_ca_id=int(payload["issuer_ca_id"]),
        issuer_name=payload["issuer_name"],
        common_name=payload.get("common_name"),
        subject_dn=payload.get("subject_dn"),
        # datetime.min stands in for a malformed timestamp instead of crashing.
        not_before=dt(payload["not_before"]) or datetime.min,
        not_after=dt(payload["not_after"]) or datetime.min,
        first_seen=dt(payload.get("first_seen")),
        serial_number=payload["serial_number"],
        revoked_count=int(payload["revoked_count"]),
        revocation_date=dt(payload.get("revocation_date")),
        reason_code=payload.get("reason_code"),
        last_seen_check_date=dt(payload.get("last_seen_check_date")),
        active_crl_count=int(payload["active_crl_count"]),
        crl_last_checked=dt(payload.get("crl_last_checked")),
        certificate_der=base64.b64decode(payload["certificate_der_b64"]),
    )
+
+

What this block is doing

This function is one of the building blocks inside `ct_scan.py`. It exists so the file can do one narrow job at a time instead of one giant unreadable routine.

+

Flow arrows

Earlier blocks or operator input feed this block. → record_from_cache_payload → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## load_cached_records + + + + + + +
+
def load_cached_records(cache_dir: Path, domain: str, ttl_seconds: int, max_candidates: int) -> list[DatabaseRecord] | None:
    """Return cached records for *domain*, or None when the cache is unusable.

    The cache is rejected (None) when the file is missing or corrupt, was
    written by a different schema version, used a different max_candidates
    cap, has no timestamp, or is older than ttl_seconds.
    """
    path = cache_path(cache_dir, domain)
    if not path.exists():
        return None
    try:
        payload = json.loads(path.read_text(encoding="utf-8"))
    except (json.JSONDecodeError, OSError):
        return None
    version_ok = payload.get("version") == 1
    cap_ok = payload.get("max_candidates") == max_candidates
    if not (version_ok and cap_ok):
        return None
    cached_at = parse_datetime(payload.get("cached_at"))
    if cached_at is None:
        return None
    cached_epoch = cached_at.replace(tzinfo=UTC).timestamp()
    if time.time() - cached_epoch > ttl_seconds:
        return None
    return [record_from_cache_payload(item) for item in payload.get("records", [])]
+
+

What this block is doing

This block loads data from disk, cache, or an earlier stage so later code can work with it.

+

Flow arrows

Earlier blocks or operator input feed this block. → load_cached_records → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## store_cached_records + + + + + + +
+
def store_cached_records(cache_dir: Path, domain: str, max_candidates: int, records: list[DatabaseRecord]) -> None:
    """Write the domain's records to the JSON cache.

    Stamps the schema version, write time, and the max_candidates cap so
    load_cached_records can reject stale or mismatched caches.
    """
    cache_dir.mkdir(parents=True, exist_ok=True)
    serialized = [record_to_cache_payload(record) for record in records]
    payload = {
        "version": 1,
        "cached_at": utc_iso(datetime.now(UTC)),
        "max_candidates": max_candidates,
        "records": serialized,
    }
    target = cache_path(cache_dir, domain)
    target.write_text(json.dumps(payload, indent=2, sort_keys=True), encoding="utf-8")
+
+

What this block is doing

This block saves an intermediate result so the next run can reuse it instead of recomputing everything.

+

Flow arrows

Earlier blocks or operator input feed this block. → store_cached_records → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## connect
+
def connect() -> psycopg.Connection:
    """Open a short-lived guest connection to crt.sh's public certwatch database.

    Uses the documented guest/guest access; autocommit avoids holding open
    transactions on the shared server, and the 5s connect timeout keeps the
    callers' retry loops responsive.

    NOTE(review): sslmode="disable" sends queries in cleartext — presumably
    because the public service does not offer TLS on 5432; confirm before
    reusing this helper elsewhere.
    """
    return psycopg.connect(
        host="crt.sh",
        port=5432,
        dbname="certwatch",
        user="guest",
        password="guest",
        connect_timeout=5,
        sslmode="disable",
        autocommit=True,
        application_name="ct_transparency_search",
    )
+
+

What this block is doing

Opens the direct guest PostgreSQL connection to crt.sh's certwatch backend.

+

Flow arrows

Called by query functions that need live crt.sh data. → connect → `query_domain`, `query_raw_match_count`, and issuer-trust lookups all depend on this connection.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## query_domain
+
def query_domain(domain: str, max_candidates: int, attempts: int, verbose: bool) -> list[DatabaseRecord]:
    """Fetch all certificate rows for one search term, refusing truncated results.

    First compares the uncapped identity-match count against max_candidates
    and raises ValueError when the capped query would silently undercount.
    Transient failures are retried up to `attempts` times with capped
    exponential backoff; the last error is re-raised when all attempts fail.
    """
    params = {
        "domain": domain,
        "name_pattern": f"%{escape_like(domain)}%",
        "max_candidates": max_candidates,
    }
    # Refuse to run a capped query that provably cannot be complete.
    raw_match_count = query_raw_match_count(domain=domain, attempts=attempts, verbose=verbose)
    if raw_match_count > max_candidates:
        raise ValueError(
            f"domain={domain} raw identity matches={raw_match_count} exceed max_candidates={max_candidates}; "
            f"increase --max-candidates-per-domain to at least {raw_match_count} for a complete result set"
        )
    last_error: Exception | None = None
    for attempt in range(1, attempts + 1):
        try:
            # Fresh connection per attempt; dict_row yields name-keyed rows.
            with connect() as conn, conn.cursor(row_factory=dict_row) as cur:
                cur.execute(QUERY_SQL, params)
                rows = cur.fetchall()
            return [row_to_record(domain, row) for row in rows]
        except Exception as exc:
            last_error = exc
            if attempt == attempts:
                break
            if verbose:
                print(
                    f"[warn] domain={domain} attempt={attempt}/{attempts} failed: {exc}",
                    file=sys.stderr,
                )
            time.sleep(min(2 ** attempt, 10))  # 2s, 4s, 8s, then capped at 10s
    # The loop only exits without returning after recording a failure.
    assert last_error is not None
    raise last_error
+
+

What this block is doing

Runs the main certificate query for one search term and refuses silent undercounting.

+

Flow arrows

A domain plus the safety cap and retry settings. → query_domain → `build_hits` receives the raw records returned here.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## query_raw_match_count + + + + + + +
+
def query_raw_match_count(domain: str, attempts: int, verbose: bool) -> int:
    """Count uncapped identity matches for *domain* on crt.sh, with retries.

    query_domain uses this count to refuse silently truncated result sets.
    Retries with capped exponential backoff and re-raises the final error.
    """
    params = {
        "domain": domain,
        "name_pattern": f"%{escape_like(domain)}%",
    }
    failure: Exception | None = None
    attempt = 0
    while attempt < attempts:
        attempt += 1
        try:
            with connect() as conn, conn.cursor() as cur:
                cur.execute(RAW_MATCH_COUNT_SQL, params)
                fetched = cur.fetchone()
            return int(fetched[0])
        except Exception as exc:
            failure = exc
            if attempt >= attempts:
                break
            if verbose:
                print(
                    f"[warn] domain={domain} raw-count attempt={attempt}/{attempts} failed: {exc}",
                    file=sys.stderr,
                )
            time.sleep(min(2 ** attempt, 10))
    assert failure is not None
    raise failure
+
+

What this block is doing

Counts how many raw hits exist before the capped query runs.

+

Flow arrows

A domain string from the local config. → query_raw_match_count → `query_domain` uses this count to refuse silent undercounting.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## row_to_record + + + + + + +
+
def row_to_record(domain: str, row: dict[str, Any]) -> DatabaseRecord:
    """Convert one dict_row result of QUERY_SQL into a DatabaseRecord.

    The int()/bytes() coercions normalize driver-returned values into plain
    Python types before they are cached or processed.
    """
    return DatabaseRecord(
        domain=domain,
        certificate_id=int(row["id"]),
        issuer_ca_id=int(row["issuer_ca_id"]),
        issuer_name=row["issuer_name"],
        common_name=row["common_name"],
        subject_dn=row["subject_dn"],
        not_before=row["not_before"],
        not_after=row["not_after"],
        first_seen=row["first_seen"],
        serial_number=row["serial_number"],
        revoked_count=int(row["revoked_count"]),
        revocation_date=row["revocation_date"],
        reason_code=row["reason_code"],
        last_seen_check_date=row["last_seen_check_date"],
        # `or 0` guards a NULL from the lateral CRL aggregate join.
        active_crl_count=int(row["active_crl_count"] or 0),
        crl_last_checked=row["crl_last_checked"],
        # bytes(...) — presumably the driver hands back a buffer/memoryview;
        # normalize to immutable bytes for hashing and caching.
        certificate_der=bytes(row["certificate"]),
    )
+
+

What this block is doing

This function is one of the building blocks inside `ct_scan.py`. It exists so the file can do one narrow job at a time instead of one giant unreadable routine.

+

Flow arrows

Earlier blocks or operator input feed this block. → row_to_record → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## extract_san_entries + + + + + + +
+
def extract_san_entries(cert: x509.Certificate) -> list[str]:
    """Return the cert's SAN values as formatted strings, sorted case-insensitively.

    Returns an empty list when the certificate has no SAN extension.
    """
    try:
        san = cert.extensions.get_extension_for_class(x509.SubjectAlternativeName)
    except x509.ExtensionNotFound:
        return []
    formatted = {format_general_name(name) for name in san.value}
    return sorted(formatted, key=str.casefold)
+
+

What this block is doing

This block pulls one specific piece of information out of a larger object.

+

Flow arrows

Earlier blocks or operator input feed this block. → extract_san_entries → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## format_general_name + + + + + + +
+
def format_general_name(name: general_name.GeneralName) -> str:
    """Render one X.509 GeneralName as a typed "KIND:value" string.

    Unrecognized name types fall through to their plain str() form.
    """
    # Simple cases share the same f-string shape; check order is preserved.
    simple_kinds = (
        (x509.DNSName, "DNS"),
        (x509.RFC822Name, "EMAIL"),
        (x509.UniformResourceIdentifier, "URI"),
        (x509.IPAddress, "IP"),
    )
    for cls, label in simple_kinds:
        if isinstance(name, cls):
            return f"{label}:{name.value}"
    if isinstance(name, x509.RegisteredID):
        return f"RID:{name.value.dotted_string}"
    if isinstance(name, x509.DirectoryName):
        return f"DIR:{name.value.rfc4514_string()}"
    if isinstance(name, x509.OtherName):
        payload = base64.b64encode(name.value).decode("ascii")
        return f"OTHER:{name.type_id.dotted_string}:{payload}"
    return str(name)
+
+

What this block is doing

This function is one of the building blocks inside `ct_scan.py`. It exists so the file can do one narrow job at a time instead of one giant unreadable routine.

+

Flow arrows

Earlier blocks or operator input feed this block. → format_general_name → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## extract_common_name + + + + + + +
+
def extract_common_name(cert: x509.Certificate) -> str | None:
    """Return the first subject CN attribute value, or None when absent."""
    attrs = cert.subject.get_attributes_for_oid(NameOID.COMMON_NAME)
    return attrs[0].value if attrs else None
+
+

What this block is doing

This block pulls one specific piece of information out of a larger object.

+

Flow arrows

Earlier blocks or operator input feed this block. → extract_common_name → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## has_precertificate_poison + + + + + + +
+
def has_precertificate_poison(cert: x509.Certificate) -> bool:
    """True when the cert carries the CT precertificate poison extension."""
    try:
        cert.extensions.get_extension_for_oid(PRECERT_POISON_OID)
        return True
    except x509.ExtensionNotFound:
        return False
+
+

What this block is doing

This function is one of the building blocks inside `ct_scan.py`. It exists so the file can do one narrow job at a time instead of one giant unreadable routine.

+

Flow arrows

Earlier blocks or operator input feed this block. → has_precertificate_poison → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## is_leaf_certificate + + + + + + +
+
def is_leaf_certificate(cert: x509.Certificate) -> tuple[bool, str]:
    """Classify a parsed certificate as a servable leaf.

    Returns (True, "leaf"), or (False, reason) where the reason is one of
    "precertificate_poison", "basic_constraints_ca", or "key_cert_sign".
    Missing extensions never disqualify a certificate.
    """
    if has_precertificate_poison(cert):
        return (False, "precertificate_poison")
    try:
        basic = cert.extensions.get_extension_for_oid(ExtensionOID.BASIC_CONSTRAINTS).value
    except x509.ExtensionNotFound:
        basic = None
    if basic is not None and basic.ca:
        return (False, "basic_constraints_ca")
    try:
        usage = cert.extensions.get_extension_for_oid(ExtensionOID.KEY_USAGE).value
    except x509.ExtensionNotFound:
        usage = None
    if usage is not None and usage.key_cert_sign:
        return (False, "key_cert_sign")
    return (True, "leaf")
+
+

What this block is doing

This function is one of the building blocks inside `ct_scan.py`. It exists so the file can do one narrow job at a time instead of one giant unreadable routine.

+

Flow arrows

Earlier blocks or operator input feed this block. → is_leaf_certificate → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## revocation_fields + + + + + + +
+
def revocation_fields(record: DatabaseRecord) -> tuple[str, datetime | None, str | None, datetime | None, str | None]:
    """Derive (status, date, reason, crl_timestamp, note) for one raw record.

    Revoked rows get a named reason (codes 0/None stay unnamed); otherwise a
    fresh CRL means "not_revoked", and no fresh CRL data means "unknown".
    """
    if record.revoked_count > 0:
        reason = REVOCATION_REASONS.get(record.reason_code)
        if reason is None and record.reason_code not in (None, 0):
            reason = f"unknown({record.reason_code})"
        return ("revoked", record.revocation_date, reason, record.last_seen_check_date, None)
    if record.active_crl_count > 0:
        return ("not_revoked", None, None, record.crl_last_checked, None)
    return ("unknown", None, None, record.crl_last_checked, "no fresh crt.sh CRL data")
+
+

What this block is doing

This function is one of the building blocks inside `ct_scan.py`. It exists so the file can do one narrow job at a time instead of one giant unreadable routine.

+

Flow arrows

Earlier blocks or operator input feed this block. → revocation_fields → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## revocation_priority + + + + + + +
+
def revocation_priority(status: str) -> int:
    """Merge precedence of a revocation status: higher values win.

    Raises KeyError on any status outside the known three.
    """
    precedence = {"unknown": 0, "not_revoked": 1, "revoked": 2}
    return precedence[status]
+
+

What this block is doing

This function is one of the building blocks inside `ct_scan.py`. It exists so the file can do one narrow job at a time instead of one giant unreadable routine.

+

Flow arrows

Earlier blocks or operator input feed this block. → revocation_priority → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## build_hits
+
def build_hits(records: list[DatabaseRecord]) -> tuple[list[CertificateHit], VerificationStats]:
    """Parse raw rows into deduplicated leaf-certificate hits.

    Each row's DER is parsed; precertificates and CA certificates are counted
    and dropped. Surviving rows are merged by SHA-256 fingerprint so that the
    same certificate seen via several domains becomes one CertificateHit.
    Returns the hits in a deterministic order plus the filter counters.
    """
    verification = VerificationStats(input_rows=len(records))
    hits: dict[str, CertificateHit] = {}
    for record in records:
        cert = x509.load_der_x509_certificate(record.certificate_der)
        is_leaf, reason = is_leaf_certificate(cert)
        if not is_leaf:
            # Count *why* the row was dropped so reports can prove coverage.
            if reason == "precertificate_poison":
                verification.precertificate_poison_filtered += 1
            else:
                verification.non_leaf_filtered += 1
            continue
        fingerprint_hex = hashlib.sha256(record.certificate_der).hexdigest()
        # Prefer the CN crt.sh stored; fall back to parsing the cert itself.
        subject_cn = record.common_name or extract_common_name(cert) or "-"
        revocation_status, revocation_date, revocation_reason, crtsh_crl_timestamp, revocation_note = revocation_fields(record)
        hit = hits.get(fingerprint_hex)
        if hit is None:
            # First sighting of this certificate: seed a fresh hit.
            hit = CertificateHit(
                fingerprint_sha256=fingerprint_hex,
                subject_cn=subject_cn,
                validity_not_before=record.not_before,
                validity_not_after=record.not_after,
                san_entries=extract_san_entries(cert),
                revocation_status=revocation_status,
                revocation_date=revocation_date,
                revocation_reason=revocation_reason,
                revocation_note=revocation_note,
                crtsh_crl_timestamp=crtsh_crl_timestamp,
                matched_domains={record.domain},
                first_seen=record.first_seen,
                crtsh_certificate_ids={record.certificate_id},
                serial_numbers={record.serial_number},
                issuer_names={record.issuer_name},
                issuer_ca_ids={record.issuer_ca_id},
            )
            hits[fingerprint_hex] = hit
            continue
        # Duplicate sighting: accumulate the set-valued identity fields.
        hit.matched_domains.add(record.domain)
        hit.crtsh_certificate_ids.add(record.certificate_id)
        hit.serial_numbers.add(record.serial_number)
        hit.issuer_names.add(record.issuer_name)
        hit.issuer_ca_ids.add(record.issuer_ca_id)
        # Keep the earliest CT first-seen across all merged rows.
        if hit.first_seen is None or (record.first_seen is not None and record.first_seen < hit.first_seen):
            hit.first_seen = record.first_seen
        # A stronger status (revoked > not_revoked > unknown) replaces the
        # whole revocation tuple at once.
        if revocation_priority(revocation_status) > revocation_priority(hit.revocation_status):
            hit.revocation_status = revocation_status
            hit.revocation_date = revocation_date
            hit.revocation_reason = revocation_reason
            hit.revocation_note = revocation_note
            hit.crtsh_crl_timestamp = crtsh_crl_timestamp
        # Same status: only advance to the freshest CRL-check timestamp.
        elif revocation_status == hit.revocation_status and hit.crtsh_crl_timestamp is not None and crtsh_crl_timestamp is not None:
            if crtsh_crl_timestamp > hit.crtsh_crl_timestamp:
                hit.crtsh_crl_timestamp = crtsh_crl_timestamp
        # Same status but no timestamp yet: take whatever this row offers.
        elif revocation_status == hit.revocation_status and hit.crtsh_crl_timestamp is None:
            hit.crtsh_crl_timestamp = crtsh_crl_timestamp
    # Deterministic report order: by domain set, CN, validity start, fingerprint.
    ordered_hits = sorted(
        hits.values(),
        key=lambda hit: (
            sorted(hit.matched_domains),
            hit.subject_cn.casefold(),
            hit.validity_not_before,
            hit.fingerprint_sha256,
        ),
    )
    verification.unique_leaf_certificates = len(ordered_hits)
    return (ordered_hits, verification)
+
+

What this block is doing

Parses certificate bytes, rejects bad objects, and merges duplicate views of the same cert.

+

Flow arrows

Raw `DatabaseRecord` rows from crt.sh. → build_hits → `build_groups`, purpose analysis, DNS analysis, and CAA analysis all consume these cleaned hits.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## canonicalize_subject_cn + + + + + + +
+
def canonicalize_subject_cn(subject_cn: str) -> str:
    """Return the canonical form of a Subject CN.

    Lowercases the name and strips a leading ``www.`` prefix so that
    ``WWW.Example.com`` and ``example.com`` compare equal when matching
    and grouping certificates.
    """
    subject_cn = subject_cn.lower()
    if subject_cn.startswith("www."):
        return subject_cn[4:]
    return subject_cn

What this block is doing

This block makes values consistent so matching and grouping do not get confused by superficial differences.

+

Flow arrows

Earlier blocks or operator input feed this block. → canonicalize_subject_cn → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## normalize_counter_pattern
+
def normalize_counter_pattern(hostname: str) -> str | None:
    """Collapse every digit run in *hostname* to ``#`` after canonicalizing.

    Returns the collapsed pattern (e.g. ``node#.example.com`` for
    ``node12.example.com``), or ``None`` when the canonical name contains
    no digits, i.e. the collapsed form would be identical to the name.
    """
    # Canonicalize once; the original computed this twice.
    canonical = canonicalize_subject_cn(hostname)
    normalized = re.sub(r"\d+", "#", canonical)
    if normalized == canonical:
        return None
    return normalized

What this block is doing

This block makes values consistent so matching and grouping do not get confused by superficial differences.

+

Flow arrows

Earlier blocks or operator input feed this block. → normalize_counter_pattern → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## UnionFind
+
class UnionFind:
    """Disjoint-set forest over the integers ``0..size-1``.

    Uses path halving in :meth:`find` and union by rank in :meth:`union`,
    giving near-constant amortized cost per operation.
    """

    def __init__(self, size: int) -> None:
        """Create *size* singleton sets, one per index."""
        self.parent = list(range(size))
        self.rank = [0] * size

    def find(self, value: int) -> int:
        """Return the root representative of *value*'s set.

        Applies path halving: every visited node is re-pointed to its
        grandparent, flattening the tree as a side effect.
        """
        while self.parent[value] != value:
            self.parent[value] = self.parent[self.parent[value]]
            value = self.parent[value]
        return value

    def union(self, left: int, right: int) -> None:
        """Merge the sets containing *left* and *right* (no-op if already merged)."""
        left_root = self.find(left)
        right_root = self.find(right)
        if left_root == right_root:
            return
        # Attach the shallower tree under the deeper one (union by rank).
        if self.rank[left_root] < self.rank[right_root]:
            left_root, right_root = right_root, left_root
        self.parent[right_root] = left_root
        if self.rank[left_root] == self.rank[right_root]:
            self.rank[left_root] += 1

What this block is doing

This class is a structured container for one piece of data that later code passes around instead of juggling many loose variables.

+

Flow arrows

Earlier blocks or operator input feed this block. → UnionFind → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## build_groups
+
def build_groups(hits: list[CertificateHit]) -> list[CertificateGroup]:
    """Cluster certificate hits into CN-based families.

    CNs whose digit-collapsed pattern covers more than one distinct
    canonical CN form a ``numbered_cn_pattern`` group; every other CN
    forms an ``exact_endpoint_family`` group keyed on its canonical form.
    Groups are sorted largest-first with deterministic tie-breaks and
    assigned stable ids ``G0001``, ``G0002``, ...
    """
    if not hits:
        return []
    # Per digit-collapsed pattern, collect the distinct canonical CNs it covers.
    canonical_cns_by_pattern: dict[str, set[str]] = defaultdict(set)
    for hit in hits:
        pattern = normalize_counter_pattern(hit.subject_cn)
        if pattern is not None:
            canonical_cns_by_pattern[pattern].add(canonicalize_subject_cn(hit.subject_cn))

    # A pattern only qualifies as a family when it spans more than one CN.
    qualifying_patterns = {
        pattern
        for pattern, canonical_cns in canonical_cns_by_pattern.items()
        if len(canonical_cns) > 1
    }
    components: dict[tuple[str, str], list[int]] = defaultdict(list)
    for index, hit in enumerate(hits):
        canonical_cn = canonicalize_subject_cn(hit.subject_cn)
        pattern = normalize_counter_pattern(hit.subject_cn)
        if pattern in qualifying_patterns:
            components[("pattern", pattern)].append(index)
        else:
            components[("exact", canonical_cn)].append(index)

    provisional_groups: list[CertificateGroup] = []
    for (family_kind, family_key), member_indices in components.items():
        member_hits = [hits[index] for index in member_indices]
        subject_cns = {hit.subject_cn for hit in member_hits}
        unique_san_profiles = {tuple(hit.san_entries) for hit in member_hits}
        numbered_patterns = {family_key} if family_kind == "pattern" else set()
        group_type = "numbered_cn_pattern" if family_kind == "pattern" else "exact_endpoint_family"
        first_seen_values = [hit.first_seen for hit in member_hits if hit.first_seen is not None]
        provisional_groups.append(
            CertificateGroup(
                group_id="",  # assigned after sorting, below
                group_type=group_type,
                member_indices=sorted(member_indices),
                member_count=len(member_indices),
                distinct_subject_cn_count=len(subject_cns),
                distinct_exact_content_count=len(unique_san_profiles),
                numbered_cn_patterns=numbered_patterns,
                matched_domains={domain for hit in member_hits for domain in hit.matched_domains},
                subject_cns=subject_cns,
                first_seen_min=min(first_seen_values) if first_seen_values else None,
                first_seen_max=max(first_seen_values) if first_seen_values else None,
                valid_from_min=min(hit.validity_not_before for hit in member_hits),
                valid_to_max=max(hit.validity_not_after for hit in member_hits),
                revocation_counts=Counter(hit.revocation_status for hit in member_hits),
            )
        )

    # Largest families first; ties broken by group type, then lowest canonical CN.
    provisional_groups.sort(
        key=lambda group: (
            -group.member_count,
            group.group_type,
            min(canonicalize_subject_cn(value) for value in group.subject_cns),
        )
    )
    for position, group in enumerate(provisional_groups, start=1):
        group.group_id = f"G{position:04d}"
    return provisional_groups

What this block is doing

Turns a flat certificate list into CN-based families such as exact endpoints or numbered rails.

+

Flow arrows

The flat list of `CertificateHit` objects. → build_groups → The report builders use these groups to turn raw certificate clutter into readable families.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## describe_group_basis
+
def describe_group_basis(group: CertificateGroup) -> str:
    """Return a one-line human explanation of why *group* was formed.

    Numbered-pattern groups show their shared CN pattern; exact-endpoint
    groups show the lowest canonical CN in the family.
    """
    if group.group_type == "numbered_cn_pattern":
        # Pattern groups carry exactly one pattern (see build_groups).
        pattern = next(iter(group.numbered_cn_patterns))
        return f"CN pattern with running-number slot: `{pattern}`"
    base = min(canonicalize_subject_cn(value) for value in group.subject_cns)
    return f"Same endpoint CN family (exact CN; `www.` grouped with base name): `{base}`"

What this block is doing

Builds the one-line explanation of why a chapter's certificates were grouped together: either the shared numbered CN pattern, or the shared exact endpoint CN family.

+

Flow arrows

Earlier blocks or operator input feed this block. → describe_group_basis → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## primary_issuer_name
+
def primary_issuer_name(hit: CertificateHit) -> str:
+    return sorted(hit.issuer_names)[0]
+
+

What this block is doing

Picks one deterministic issuer name for a certificate by sorting its issuer names and taking the first; used wherever a single display issuer is needed.

+

Flow arrows

Earlier blocks or operator input feed this block. → primary_issuer_name → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## query_issuer_trust
+
def query_issuer_trust(hits: list[CertificateHit]) -> dict[str, IssuerTrustInfo]:
    """Look up current server-auth trust for every issuer seen in *hits*.

    Collects all CA ids per primary issuer name, queries the crt.sh
    certwatch database once for the trust contexts that currently trust
    those CA ids for 'Server Authentication', and marks an issuer as
    major-WebPKI when all five major root programs trust it.
    """
    issuer_name_to_ca_ids: dict[str, set[int]] = defaultdict(set)
    for hit in hits:
        issuer_name_to_ca_ids[primary_issuer_name(hit)].update(hit.issuer_ca_ids)
    all_ca_ids = sorted({ca_id for ca_ids in issuer_name_to_ca_ids.values() for ca_id in ca_ids})
    contexts_by_ca_id: dict[int, set[str]] = defaultdict(set)
    if all_ca_ids:
        # Single batched query: time-valid, not-disabled server-auth trust rows.
        query = """
        SELECT ctp.ca_id, tc.ctx
        FROM ca_trust_purpose ctp
        JOIN trust_context tc ON tc.id = ctp.trust_context_id
        JOIN trust_purpose tp ON tp.id = ctp.trust_purpose_id
        WHERE ctp.ca_id = ANY(%s)
          AND tp.purpose = 'Server Authentication'
          AND ctp.is_time_valid = TRUE
          AND ctp.disabled_from IS NULL
        """
        with connect() as conn, conn.cursor() as cur:
            cur.execute(query, (all_ca_ids,))
            for ca_id, trust_context in cur.fetchall():
                contexts_by_ca_id[int(ca_id)].add(str(trust_context))
    major_contexts = {"Mozilla", "Chrome", "Apple", "Microsoft", "Android"}
    results: dict[str, IssuerTrustInfo] = {}
    for issuer_name, ca_ids in issuer_name_to_ca_ids.items():
        # An issuer may map to several CA ids; merge their trust contexts.
        merged_contexts = {ctx for ca_id in ca_ids for ctx in contexts_by_ca_id.get(ca_id, set())}
        results[issuer_name] = IssuerTrustInfo(
            issuer_name=issuer_name,
            issuer_ca_ids=set(ca_ids),
            server_auth_contexts=merged_contexts,
            major_webpki=major_contexts.issubset(merged_contexts),
        )
    return results

What this block is doing

Checks which issuers are currently trusted for public TLS in the major WebPKI contexts.

+

Flow arrows

The cleaned current certificate hits. → query_issuer_trust → Report builders use this trust view in the certificate chapters and appendix tables.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## status_marker
+
def status_marker(status: str) -> str:
    """Map a revocation status to its fixed-width three-character marker.

    The trailing space in ``"OK "`` keeps validity-history lines aligned.
    Raises KeyError for any status outside the three known values.
    """
    return {
        "not_revoked": "OK ",
        "revoked": "REV",
        "unknown": "UNK",
    }[status]

What this block is doing

Maps a revocation status string to a fixed-width three-character marker (`OK `, `REV`, `UNK`) used to align the validity-history lines in the report.

+

Flow arrows

Earlier blocks or operator input feed this block. → status_marker → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## one_line_revocation
+
def one_line_revocation(hit: CertificateHit) -> str:
+    if hit.revocation_status == "revoked":
+        detail = f"revoked {utc_iso(hit.revocation_date)}" if hit.revocation_date else "revoked"
+        if hit.revocation_reason:
+            detail += f", reason={hit.revocation_reason}"
+        return detail
+    if hit.revocation_status == "unknown":
+        if hit.revocation_note:
+            return f"unknown, {hit.revocation_note}"
+        return "unknown"
+    return "not revoked"
+
+

What this block is doing

Formats a certificate's revocation state as one short human-readable phrase, including the revocation date and reason when they are known.

+

Flow arrows

Earlier blocks or operator input feed this block. → one_line_revocation → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## san_tail_split
+
def san_tail_split(domain: str) -> tuple[list[str], str]:
    """Split a DNS name into ``(prefix_labels, tail)``.

    The tail is normally the last two labels; when the TLD is two letters
    and the second-level label is a common one (``co``, ``com``, ...), the
    tail takes three labels (e.g. ``example.co.uk``).  Names no longer
    than the tail are returned whole as the tail with no prefix labels.
    Note: this is a small heuristic, not a full public-suffix lookup.
    """
    labels = domain.split(".")
    common_second_level = {"ac", "co", "com", "edu", "gov", "net", "org"}
    suffix_len = 2
    if len(labels) >= 3 and len(labels[-1]) == 2 and labels[-2] in common_second_level:
        suffix_len = 3
    if len(labels) <= suffix_len:
        return ([], domain)
    return (labels[:-suffix_len], ".".join(labels[-suffix_len:]))

What this block is doing

Splits a DNS name into prefix labels and a registrable tail, treating common second-level labels such as `co` and `com` under two-letter TLDs as part of the tail.

+

Flow arrows

Earlier blocks or operator input feed this block. → san_tail_split → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## build_san_tree_lines
+
def build_san_tree_lines(san_entries: list[str]) -> list[str]:
    """Render the SAN tree with Unicode box-drawing connectors.

    Thin convenience wrapper over ``build_san_tree_lines_with_style``
    with ``ascii_only=False``.
    """
    return build_san_tree_lines_with_style(san_entries, ascii_only=False)

What this block is doing

This block constructs a richer higher-level result from simpler inputs.

+

Flow arrows

Earlier blocks or operator input feed this block. → build_san_tree_lines → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## build_san_tree_units_with_style
+
def build_san_tree_units_with_style(san_entries: list[str], ascii_only: bool) -> list[list[str]]:
    """Render SAN entries as a list of tree-drawing units.

    ``DNS:`` entries are deduplicated, split via ``san_tail_split``, and
    arranged into a label tree; each top-level subtree becomes one unit
    (a list of rendered lines).  Non-DNS entries become single-line bullet
    units.  With *ascii_only* the connectors are plain ASCII instead of
    box-drawing characters.  Always returns at least one unit.
    """
    dns_entries = sorted({entry[4:] for entry in san_entries if entry.startswith("DNS:")})
    other_entries = sorted({entry for entry in san_entries if not entry.startswith("DNS:")})
    # Nested dicts model the tree: prefix labels lead down to the tail leaf.
    tree: dict[str, Any] = {}
    for domain in dns_entries:
        prefix_labels, tail = san_tail_split(domain)
        cursor = tree
        for label in prefix_labels:
            cursor = cursor.setdefault(label, {})
        cursor.setdefault(tail, {})

    def render(node: dict[str, Any], prefix: str = "") -> list[str]:
        """Recursively render *node* into connector-prefixed lines."""
        lines: list[str] = []
        keys = sorted(node.keys(), key=str.casefold)
        for index, key in enumerate(keys):
            is_last = index == len(keys) - 1
            if ascii_only:
                connector = "`- " if is_last else "|- "
            else:
                connector = "└─ " if is_last else "├─ "
            lines.append(prefix + connector + key)
            child = node[key]
            if ascii_only:
                child_prefix = prefix + ("   " if is_last else "|  ")
            else:
                child_prefix = prefix + ("   " if is_last else "│  ")
            lines.extend(render(child, child_prefix))
        return lines

    units: list[list[str]] = []
    for key in sorted(tree.keys(), key=str.casefold):
        units.append(render({key: tree[key]}))
    for entry in other_entries:
        units.append([f"{'*' if ascii_only else '•'} {entry}"])
    if not units:
        # Placeholder so callers always get something to print.
        units.append([f"{'*' if ascii_only else '•'} -"])
    return units

What this block is doing

This block constructs a richer higher-level result from simpler inputs.

+

Flow arrows

Earlier blocks or operator input feed this block. → build_san_tree_units_with_style → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## build_san_tree_chunks_with_style
+
def build_san_tree_chunks_with_style(
    san_entries: list[str],
    ascii_only: bool,
    max_lines_per_chunk: int = 24,
) -> list[list[str]]:
    """Pack rendered SAN tree units into chunks of bounded line count.

    Units are kept intact when possible; a unit longer than
    *max_lines_per_chunk* is flushed on its own and hard-split into
    consecutive slices.  Useful for paginating trees in report output.
    """
    chunks: list[list[str]] = []
    current_chunk: list[str] = []
    current_lines = 0

    def flush_current_chunk() -> None:
        """Emit the accumulated chunk, if any, and reset the accumulator."""
        nonlocal current_chunk, current_lines
        if current_chunk:
            chunks.append(current_chunk)
            current_chunk = []
            current_lines = 0

    for unit in build_san_tree_units_with_style(san_entries, ascii_only=ascii_only):
        if len(unit) > max_lines_per_chunk:
            # Oversized unit: flush what we have, then slice it directly.
            flush_current_chunk()
            for start in range(0, len(unit), max_lines_per_chunk):
                chunks.append(unit[start : start + max_lines_per_chunk])
            continue
        if current_chunk and current_lines + len(unit) > max_lines_per_chunk:
            flush_current_chunk()
        current_chunk.extend(unit)
        current_lines += len(unit)

    flush_current_chunk()
    return chunks

What this block is doing

This block constructs a richer higher-level result from simpler inputs.

+

Flow arrows

Earlier blocks or operator input feed this block. → build_san_tree_chunks_with_style → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## build_san_tree_lines_with_style
+
def build_san_tree_lines_with_style(san_entries: list[str], ascii_only: bool) -> list[str]:
    """Render the SAN tree as one flat list of lines.

    Delegates to the chunked renderer with an effectively unlimited
    chunk size (10,000 lines) and concatenates the resulting chunks.
    """
    lines: list[str] = []
    for chunk in build_san_tree_chunks_with_style(
        san_entries,
        ascii_only=ascii_only,
        max_lines_per_chunk=10_000,
    ):
        lines.extend(chunk)
    return lines

What this block is doing

This block constructs a richer higher-level result from simpler inputs.

+

Flow arrows

Earlier blocks or operator input feed this block. → build_san_tree_lines_with_style → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## group_hits_by_issuer
+
def group_hits_by_issuer(hits: list[CertificateHit]) -> tuple[dict[str, list[CertificateHit]], list[str]]:
    """Bucket hits by their primary issuer name.

    Returns the mapping plus the issuer names ordered by descending
    certificate count, with case-insensitive name order as tie-break.
    """
    issuer_hits: dict[str, list[CertificateHit]] = defaultdict(list)
    for hit in hits:
        issuer_hits[primary_issuer_name(hit)].append(hit)
    ordered_issuers = sorted(
        issuer_hits,
        key=lambda issuer_name: (-len(issuer_hits[issuer_name]), issuer_name.casefold()),
    )
    return issuer_hits, ordered_issuers

What this block is doing

This block clusters related items together so later code can analyze them as families instead of as isolated rows.

+

Flow arrows

Earlier blocks or operator input feed this block. → group_hits_by_issuer → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## latex_escape
+
def latex_escape(value: str) -> str:
    """Escape LaTeX special characters in *value*.

    Covers the ten LaTeX specials (backslash, ``& % $ # _ { } ~ ^``) so
    arbitrary certificate strings can be embedded in the generated report.
    """
    replacements = {
        "\\": r"\textbackslash{}",
        "&": r"\&",
        "%": r"\%",
        "$": r"\$",
        "#": r"\#",
        "_": r"\_",
        "{": r"\{",
        "}": r"\}",
        "~": r"\textasciitilde{}",
        "^": r"\textasciicircum{}",
    }
    return "".join(replacements.get(char, char) for char in value)

What this block is doing

Escapes LaTeX special characters (backslash, ampersand, percent, and the other LaTeX specials) so arbitrary certificate strings can be embedded safely in the generated LaTeX report.

+

Flow arrows

Earlier blocks or operator input feed this block. → latex_escape → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## summarize_san_patterns
+
def summarize_san_patterns(san_entries: list[str]) -> dict[str, Any]:
    """Summarize a SAN list into counts and dominant patterns.

    Returns a dict with: DNS/other entry counts, wildcard and numbered-name
    counts, distinct zone (tail) count, the top zones, and the digit-collapsed
    name patterns that repeat more than once (up to six of each).
    """
    dns_entries = sorted({entry[4:] for entry in san_entries if entry.startswith("DNS:")}, key=str.casefold)
    other_entries = sorted({entry for entry in san_entries if not entry.startswith("DNS:")}, key=str.casefold)
    zone_counts: Counter[str] = Counter()
    normalized_pattern_counts: Counter[str] = Counter()
    wildcard_count = 0
    numbered_count = 0
    for domain in dns_entries:
        # Strip a wildcard label so "*.x" and "x" normalize the same way.
        normalized_domain = domain[2:] if domain.startswith("*.") else domain
        if domain.startswith("*."):
            wildcard_count += 1
        if re.search(r"\d", normalized_domain):
            numbered_count += 1
        prefix_labels, tail = san_tail_split(normalized_domain)
        zone_counts[tail] += 1
        # Collapse digit runs per label so node1/node2 count as one pattern.
        normalized_prefix = ".".join(re.sub(r"\d+", "#", label) for label in prefix_labels if label)
        if normalized_prefix:
            normalized_pattern_counts[f"{normalized_prefix}.{tail}"] += 1
        else:
            normalized_pattern_counts[tail] += 1
    repeating_patterns = [
        (pattern, count)
        for pattern, count in normalized_pattern_counts.most_common(6)
        if count > 1
    ]
    return {
        "dns_count": len(dns_entries),
        "other_count": len(other_entries),
        "wildcard_count": wildcard_count,
        "numbered_count": numbered_count,
        "zone_count": len(zone_counts),
        "top_zones": zone_counts.most_common(6),
        "repeating_patterns": repeating_patterns,
    }

What this block is doing

This block compresses many detailed rows into a smaller, easier-to-read summary.

+

Flow arrows

Earlier blocks or operator input feed this block. → summarize_san_patterns → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## latex_status_badge
+
def latex_status_badge(status: str) -> str:
    """Map a revocation status to its LaTeX badge macro.

    The macros are defined in the report preamble.  Raises KeyError for
    any status outside the three known values.
    """
    return {
        "not_revoked": r"\StatusOK{}",
        "revoked": r"\StatusREV{}",
        "unknown": r"\StatusUNK{}",
    }[status]

What this block is doing

Maps a revocation status to the matching LaTeX badge macro used in the PDF report.

+

Flow arrows

Earlier blocks or operator input feed this block. → latex_status_badge → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## latex_webpki_badge
+
def latex_webpki_badge(value: bool) -> str:
    """Map the major-WebPKI boolean to its LaTeX badge macro."""
    return r"\WebPKIYes{}" if value else r"\WebPKINo{}"

What this block is doing

Maps the major-WebPKI boolean to the matching LaTeX badge macro used in the PDF report.

+

Flow arrows

Earlier blocks or operator input feed this block. → latex_webpki_badge → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## render_markdown_report
+
def render_markdown_report(
    path: Path,
    hits: list[CertificateHit],
    groups: list[CertificateGroup],
    stats: ScanStats,
    issuer_trust: dict[str, IssuerTrustInfo],
) -> None:
    """Write the Markdown certificate inventory report to *path*.

    Structure: issuer overview, leaf-assurance notes, then one section per
    issuer with CN-family chapters (rebuilt per issuer via ``build_groups``)
    and per-Subject-CN subsections, followed by overall statistics.

    Note: the file-level ``groups`` argument is not consumed here; chapters
    are recomputed per issuer instead.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    issuer_hits, ordered_issuers = group_hits_by_issuer(hits)
    lines: list[str] = []
    lines.append("# Certificate CN Family Report")
    lines.append("")
    lines.append(f"Generated: {stats.generated_at_utc}")
    lines.append(f"Configured domains: {', '.join(stats.configured_domains)}")
    lines.append("")
    lines.append("## What This File Contains")
    lines.append("")
    lines.append("- Chapters are built from Subject CN construction only.")
    lines.append("- If multiple concrete CNs share the same numbered schema, they are grouped together.")
    lines.append("- Otherwise the chapter is one endpoint family; `www.` is grouped with the base name as a low-signal convenience.")
    lines.append("- SAN entries are shown only inside each Subject CN subsection.")
    lines.append("- All certificates shown here are verified leaf certificates.")
    lines.append("")
    lines.append("## Issuer Overview")
    lines.append("")
    for issuer_name in ordered_issuers:
        trust = issuer_trust[issuer_name]
        ca_ids = ", ".join(str(value) for value in sorted(trust.issuer_ca_ids))
        trust_label = "YES" if trust.major_webpki else "NO"
        lines.append(
            f"- {issuer_name} | certificates={len(issuer_hits[issuer_name])} | WebPKI server-auth in major stores={trust_label} | ca_id={ca_ids}"
        )
    lines.append("")
    lines.append("## Leaf-Certificate Assurance")
    lines.append("")
    lines.append("- SQL filter: `certificate_lifecycle.certificate_type = 'Certificate'`")
    lines.append("- Local filter: precertificate poison absent, `BasicConstraints.ca != true`, `KeyUsage.keyCertSign != true`")
    lines.append(f"- Verified leaf certificates kept: {stats.unique_leaf_certificates}")
    lines.append(f"- Non-leaf filtered after download: {stats.verification.non_leaf_filtered}")
    lines.append(f"- Precertificate poison filtered after download: {stats.verification.precertificate_poison_filtered}")
    lines.append("")
    # One top-level section per issuer, largest issuer first.
    for issuer_position, issuer_name in enumerate(ordered_issuers, start=1):
        trust = issuer_trust[issuer_name]
        issuer_title = f"Issuer {issuer_position:02d}  {issuer_name}"
        lines.append(f"## {issuer_title}")
        lines.append("")
        lines.append(f"- Certificates under issuer: {len(issuer_hits[issuer_name])}")
        lines.append(
            f"- WebPKI server-auth in major stores (Mozilla, Chrome, Apple, Microsoft, Android): {'YES' if trust.major_webpki else 'NO'}"
        )
        lines.append(
            f"- Server-auth trust contexts seen in crt.sh live trust data: {', '.join(sorted(trust.server_auth_contexts)) if trust.server_auth_contexts else 'none'}"
        )
        lines.append(f"- Issuer CA IDs: {', '.join(str(value) for value in sorted(trust.issuer_ca_ids))}")
        lines.append("")
        # CN-family chapters are computed per issuer, not from the global groups.
        issuer_groups = build_groups(issuer_hits[issuer_name])
        for family_index, group in enumerate(issuer_groups, start=1):
            member_hits = [issuer_hits[issuer_name][index] for index in group.member_indices]
            chapter_title = f"Family {family_index:02d}  {describe_group_basis(group)}"
            lines.append(f"### {chapter_title}")
            lines.append("")
            lines.append(f"- Certificates in chapter: {group.member_count}")
            lines.append(f"- Concrete Subject CNs: {group.distinct_subject_cn_count}")
            lines.append(f"- Distinct SAN profiles in chapter: {group.distinct_exact_content_count}")
            lines.append(f"- Matched domains: {', '.join(sorted(group.matched_domains))}")
            lines.append(f"- Family validity span: {utc_iso(group.valid_from_min)} -> {utc_iso(group.valid_to_max)}")
            if group.first_seen_min and group.first_seen_max:
                lines.append(f"- First seen span: {utc_iso(group.first_seen_min)} -> {utc_iso(group.first_seen_max)}")
            lines.append(f"- Revocation mix: {group.revocation_counts.get('revoked', 0)} revoked, {group.revocation_counts.get('not_revoked', 0)} not revoked, {group.revocation_counts.get('unknown', 0)} unknown")
            lines.append("")

            # Per-Subject-CN subsections within the chapter.
            hits_by_subject: dict[str, list[CertificateHit]] = defaultdict(list)
            for hit in member_hits:
                hits_by_subject[hit.subject_cn].append(hit)

            ordered_subjects = sorted(
                hits_by_subject.keys(),
                key=lambda value: (canonicalize_subject_cn(value), value.casefold()),
            )
            for subject_cn in ordered_subjects:
                subject_hits = sorted(
                    hits_by_subject[subject_cn],
                    key=lambda hit: (hit.validity_not_before, hit.validity_not_after, hit.fingerprint_sha256),
                )
                lines.append(f"#### Subject CN: `{subject_cn}`")
                lines.append("")
                lines.append(f"- Certificates under this CN: {len(subject_hits)}")
                lines.append(f"- Validity span under this CN: {utc_iso(min(hit.validity_not_before for hit in subject_hits))} -> {utc_iso(max(hit.validity_not_after for hit in subject_hits))}")
                san_profiles: dict[tuple[str, ...], list[CertificateHit]] = defaultdict(list)
                for hit in subject_hits:
                    san_profiles[tuple(hit.san_entries)].append(hit)
                profile_size_counts = Counter(len(profile) for profile in san_profiles)
                unique_san_entries = sorted({entry for hit in subject_hits for entry in hit.san_entries})
                lines.append(f"- Distinct SAN profiles under this CN: {len(san_profiles)}")
                lines.append(
                    "- SAN profile sizes seen: "
                    + ", ".join(
                        f"{size} SAN x {count}"
                        for size, count in sorted(profile_size_counts.items())
                    )
                )
                lines.append("")
                lines.append("Validity history")
                lines.append("")

                for hit in subject_hits:
                    crtsh_ids = ", ".join(str(value) for value in sorted(hit.crtsh_certificate_ids))
                    lines.append(
                        f"- [{status_marker(hit.revocation_status)}] {utc_iso(hit.validity_not_before)} -> {utc_iso(hit.validity_not_after)} | SANs={len(hit.san_entries)} | crt.sh={crtsh_ids} | {one_line_revocation(hit)}"
                    )
                lines.append("")
                lines.append("SAN structure")
                lines.append("")
                lines.append("```text")
                for tree_line in build_san_tree_lines(unique_san_entries):
                    lines.append(tree_line)
                lines.append("```")
                lines.append("")

        lines.append("---")
        lines.append("")

    lines.append("## Statistics")
    lines.append("")
    lines.append(f"- Unique leaf certificates: {stats.unique_leaf_certificates}")
    lines.append(f"- CN-family chapters: {stats.groups_total}")
    lines.append(f"- Chapters with more than one certificate: {stats.groups_multi_member}")
    lines.append(f"- Single-certificate chapters: {stats.groups_singleton}")
    lines.append(f"- Numbered CN pattern chapters: {stats.groups_by_type.get('numbered_cn_pattern', 0)}")
    lines.append(f"- Exact endpoint chapters: {stats.groups_by_type.get('exact_endpoint_family', 0)}")
    lines.append("")
    path.write_text("\n".join(lines) + "\n", encoding="utf-8")

What this block is doing

Writes the raw inventory appendix as readable Markdown.

+

Flow arrows

Current hits, groups, and trust data. → render_markdown_report → Produces the Markdown inventory appendix.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## render_latex_report
+
def render_latex_report(
+    path: Path,
+    hits: list[CertificateHit],
+    groups: list[CertificateGroup],
+    stats: ScanStats,
+    issuer_trust: dict[str, IssuerTrustInfo],
+    show_page_numbers: bool = True,
+) -> None:
+    """Render the scan results as a standalone XeLaTeX document at *path*.
+
+    Document layout: title page, executive summary with an issuer landscape
+    table, one ``\\section`` per issuer, one ``\\subsection`` per CN family,
+    one ``\\subsubsection`` per concrete Subject CN (timeline + SAN panels),
+    and a closing statistics section.
+
+    Args:
+        path: Destination ``.tex`` file; parent directories are created.
+        hits: Deduplicated leaf-certificate hits to report on.
+        groups: Scan-wide CN-family groups (per-issuer families are rebuilt
+            below with ``build_groups``; this argument feeds only ``stats``-level
+            context kept for interface parity with ``render_markdown_report``).
+        stats: Scan-wide counters shown on the title page and statistics page.
+        issuer_trust: Per-issuer trust info keyed by exact issuer name.
+        show_page_numbers: When False, the ``empty`` page style is used.
+    """
+    path.parent.mkdir(parents=True, exist_ok=True)
+    issuer_hits, ordered_issuers = group_hits_by_issuer(hits)
+    # Global revocation tallies for the title-page metric chips.
+    revoked_total = sum(1 for hit in hits if hit.revocation_status == "revoked")
+    unknown_total = sum(1 for hit in hits if hit.revocation_status == "unknown")
+    not_revoked_total = sum(1 for hit in hits if hit.revocation_status == "not_revoked")
+
+    # Preamble, color palette, tcolorbox styles, chip/badge macros, title
+    # page, executive summary, and the issuer landscape table header.
+    lines: list[str] = [
+        r"\documentclass[11pt]{article}",
+        r"\usepackage[a4paper,margin=18mm]{geometry}",
+        r"\usepackage{fontspec}",
+        r"\usepackage[table]{xcolor}",
+        r"\usepackage{microtype}",
+        r"\usepackage{hyperref}",
+        r"\usepackage{xurl}",
+        r"\usepackage{array}",
+        r"\usepackage{booktabs}",
+        r"\usepackage{tabularx}",
+        r"\usepackage{longtable}",
+        r"\usepackage{enumitem}",
+        r"\usepackage{titlesec}",
+        r"\usepackage[most]{tcolorbox}",
+        r"\usepackage{fancyvrb}",
+        r"\usepackage{needspace}",
+        r"\defaultfontfeatures{Ligatures=TeX,Scale=MatchLowercase}",
+        # Named color palette shared by all panels, chips, and badges below.
+        r"\definecolor{Ink}{HTML}{17202A}",
+        r"\definecolor{Muted}{HTML}{667085}",
+        r"\definecolor{Line}{HTML}{D0D5DD}",
+        r"\definecolor{Panel}{HTML}{F8FAFC}",
+        r"\definecolor{Accent}{HTML}{0F766E}",
+        r"\definecolor{AccentSoft}{HTML}{E6F4F1}",
+        r"\definecolor{AccentLine}{HTML}{74C4B8}",
+        r"\definecolor{Warn}{HTML}{9A6700}",
+        r"\definecolor{WarnSoft}{HTML}{FFF4DB}",
+        r"\definecolor{Danger}{HTML}{B42318}",
+        r"\definecolor{DangerSoft}{HTML}{FEE4E2}",
+        r"\definecolor{OkText}{HTML}{065F46}",
+        r"\definecolor{OkSoft}{HTML}{DCFCE7}",
+        r"\definecolor{UnknownText}{HTML}{9A6700}",
+        r"\definecolor{UnknownSoft}{HTML}{FEF3C7}",
+        r"\hypersetup{colorlinks=true,linkcolor=Accent,urlcolor=Accent,pdfauthor={CertTransparencySearch},pdftitle={Certificate Transparency Endpoint Atlas}}",
+        r"\setlength{\parindent}{0pt}",
+        r"\setlength{\parskip}{6pt}",
+        r"\setlength{\emergencystretch}{3em}",
+        r"\setlength{\footskip}{24pt}",
+        r"\setlength{\tabcolsep}{4.2pt}",
+        r"\renewcommand{\arraystretch}{1.12}",
+        r"\raggedbottom",
+        r"\setcounter{tocdepth}{2}",
+        rf"\pagestyle{{{'plain' if show_page_numbers else 'empty'}}}",
+        r"\titleformat{\section}{\sffamily\bfseries\LARGE\color{Ink}\raggedright}{\thesection}{0.8em}{}",
+        r"\titleformat{\subsection}{\sffamily\bfseries\Large\color{Ink}\raggedright}{\thesubsection}{0.8em}{}",
+        r"\titleformat{\subsubsection}{\sffamily\bfseries\normalsize\color{Ink}\raggedright}{\thesubsubsection}{0.8em}{}",
+        # tcolorbox styles: "panel" is the base; the others specialize it.
+        r"\tcbset{",
+        r"  panel/.style={enhanced,breakable,boxrule=0.55pt,arc=3pt,left=9pt,right=9pt,top=8pt,bottom=8pt,colback=white,colframe=Line},",
+        r"  hero/.style={panel,colback=Ink,colframe=Ink,left=14pt,right=14pt,top=14pt,bottom=14pt},",
+        r"  summary/.style={panel,colback=Panel,colframe=Line},",
+        r"  issuerpanel/.style={panel,colback=Panel,colframe=Ink!45},",
+        r"  familypanel/.style={panel,colback=AccentSoft,colframe=AccentLine},",
+        r"  subjectpanel/.style={panel,colback=white,colframe=Line},",
+        r"  treepanel/.style={enhanced,boxrule=0.55pt,arc=3pt,left=9pt,right=9pt,top=8pt,bottom=8pt,colback=Panel,colframe=AccentLine},",
+        r"}",
+        # Inline chip/badge macros used throughout the body.
+        r"\newcommand{\DomainChip}[1]{\tcbox[on line,boxrule=0pt,arc=3pt,left=5pt,right=5pt,top=2pt,bottom=2pt,colback=AccentSoft]{\sffamily\footnotesize\texttt{#1}}}",
+        r"\newcommand{\MetricChip}[2]{\tcbox[on line,boxrule=0pt,arc=3pt,left=6pt,right=6pt,top=3pt,bottom=3pt,colback=Panel]{\sffamily\footnotesize\textcolor{Muted}{#1}\hspace{0.45em}\textbf{#2}}}",
+        r"\newcommand{\StatusOK}{\tcbox[on line,boxrule=0pt,arc=3pt,left=5pt,right=5pt,top=2pt,bottom=2pt,colback=OkSoft]{\sffamily\bfseries\footnotesize\textcolor{OkText}{OK}}}",
+        r"\newcommand{\StatusREV}{\tcbox[on line,boxrule=0pt,arc=3pt,left=5pt,right=5pt,top=2pt,bottom=2pt,colback=DangerSoft]{\sffamily\bfseries\footnotesize\textcolor{Danger}{REV}}}",
+        r"\newcommand{\StatusUNK}{\tcbox[on line,boxrule=0pt,arc=3pt,left=5pt,right=5pt,top=2pt,bottom=2pt,colback=UnknownSoft]{\sffamily\bfseries\footnotesize\textcolor{UnknownText}{UNK}}}",
+        r"\newcommand{\WebPKIYes}{\tcbox[on line,boxrule=0pt,arc=3pt,left=5pt,right=5pt,top=2pt,bottom=2pt,colback=OkSoft]{\sffamily\bfseries\footnotesize\textcolor{OkText}{WebPKI: YES}}}",
+        r"\newcommand{\WebPKINo}{\tcbox[on line,boxrule=0pt,arc=3pt,left=5pt,right=5pt,top=2pt,bottom=2pt,colback=DangerSoft]{\sffamily\bfseries\footnotesize\textcolor{Danger}{WebPKI: NO}}}",
+        r"\begin{document}",
+        r"\begin{titlepage}",
+        r"\thispagestyle{empty}",
+        r"\vspace*{20mm}",
+        r"\begin{tcolorbox}[hero]",
+        r"{\color{white}\sffamily\bfseries\fontsize{24}{28}\selectfont Certificate Transparency Endpoint Atlas\par}",
+        r"\vspace{4pt}",
+        r"{\color{white}\Large Currently valid leaf certificates matching the configured domains\par}",
+        r"\vspace{12pt}",
+        r"{\color{white}\sffamily\small This artefact is optimized for review: issuer-first navigation, CN-family grouping, certificate timelines, and SAN structure blocks designed to be read rather than decoded.}",
+        r"\end{tcolorbox}",
+        r"\vspace{10mm}",
+        r"\begin{tcolorbox}[summary]",
+        rf"\textbf{{Generated}}: {latex_escape(stats.generated_at_utc)}\par",
+        r"\textbf{Configured domains}: " + " ".join(
+            rf"\DomainChip{{{latex_escape(domain)}}}" for domain in stats.configured_domains
+        ),
+        r"\par\medskip",
+        r"\MetricChip{Leaf certificates}{" + str(stats.unique_leaf_certificates) + r"}" + " "
+        + r"\MetricChip{CN families}{" + str(stats.groups_total) + r"}" + " "
+        + r"\MetricChip{Numbered families}{" + str(stats.groups_by_type.get("numbered_cn_pattern", 0)) + r"}" + " "
+        + r"\MetricChip{Exact families}{" + str(stats.groups_by_type.get("exact_endpoint_family", 0)) + r"}",
+        r"\par\medskip",
+        r"\MetricChip{Not revoked}{" + str(not_revoked_total) + r"}" + " "
+        + r"\MetricChip{Revoked}{" + str(revoked_total) + r"}" + " "
+        + r"\MetricChip{Unknown}{" + str(unknown_total) + r"}",
+        r"\end{tcolorbox}",
+        r"\vfill",
+        r"{\sffamily\small\textcolor{Muted}{Same scan, three outputs: Markdown for editor preview, LaTeX for source control, PDF for distribution.}}",
+        r"\end{titlepage}",
+        r"\tableofcontents",
+        r"\clearpage",
+        r"\section*{Executive Summary}",
+        r"\addcontentsline{toc}{section}{Executive Summary}",
+        r"\begin{tcolorbox}[summary]",
+        r"\textbf{Reading guide}\par",
+        r"Major chapters are exact issuer names. Inside each issuer, families are derived only from the construction of the Subject CN. Each concrete Subject CN then gets its own certificate timeline and a SAN structure panel.\par",
+        r"\medskip",
+        r"\textbf{Leaf-only assurance}\par",
+        r"SQL excludes entries whose lifecycle type is not \texttt{Certificate}. Local parsing then rejects any artifact with precertificate poison, \texttt{BasicConstraints.ca = true}, or \texttt{KeyUsage.keyCertSign = true}.",
+        r"\end{tcolorbox}",
+        r"\begin{tcolorbox}[summary]",
+        r"\textbf{Issuer landscape}\par",
+        r"\medskip",
+        r"\begin{tabularx}{\linewidth}{>{\raggedright\arraybackslash}X >{\raggedleft\arraybackslash}p{1.7cm} >{\raggedleft\arraybackslash}p{1.9cm} >{\raggedleft\arraybackslash}p{2.0cm}}",
+        r"\toprule",
+        r"Issuer & Certificates & Share & WebPKI \\",
+        r"\midrule",
+    ]
+
+    # One row per issuer in the landscape table. The fallback to 1 only
+    # avoids a ZeroDivisionError when there are no hits at all.
+    total_hits = len(hits) if hits else 1
+    for issuer_name in ordered_issuers:
+        issuer_count = len(issuer_hits[issuer_name])
+        share = f"{issuer_count / total_hits:.1%}"
+        lines.append(
+            rf"{latex_escape(issuer_name)} & {issuer_count} & {latex_escape(share)} & {latex_webpki_badge(issuer_trust[issuer_name].major_webpki)} \\"
+        )
+    lines.extend(
+        [
+            r"\bottomrule",
+            r"\end{tabularx}",
+            r"\end{tcolorbox}",
+        ]
+    )
+
+    # Body: one \section per issuer, each opened by an issuer summary panel.
+    for issuer_position, issuer_name in enumerate(ordered_issuers, start=1):
+        trust = issuer_trust[issuer_name]
+        # Families are regrouped per issuer so family numbering restarts
+        # inside each issuer chapter.
+        issuer_groups = build_groups(issuer_hits[issuer_name])
+        lines.extend(
+            [
+                r"\clearpage",
+                rf"\section{{Issuer {issuer_position:02d}: {latex_escape(issuer_name)}}}",
+                r"\begin{tcolorbox}[issuerpanel]",
+                r"\MetricChip{Certificates}{" + str(len(issuer_hits[issuer_name])) + r"}" + " "
+                + r"\MetricChip{Families}{" + str(len(issuer_groups)) + r"}" + " "
+                + latex_webpki_badge(trust.major_webpki),
+                r"\par\medskip",
+                rf"\textbf{{Trust contexts seen in crt.sh live data}}: {latex_escape(', '.join(sorted(trust.server_auth_contexts)) if trust.server_auth_contexts else 'none')}\par",
+                rf"\textbf{{Issuer CA IDs}}: {latex_escape(', '.join(str(value) for value in sorted(trust.issuer_ca_ids)))}",
+                r"\end{tcolorbox}",
+            ]
+        )
+        # One \subsection per CN family inside the issuer chapter.
+        for family_index, group in enumerate(issuer_groups, start=1):
+            member_hits = [issuer_hits[issuer_name][index] for index in group.member_indices]
+            lines.extend(
+                [
+                    # \Needspace keeps the family header with its panel
+                    # instead of orphaning it at the bottom of a page.
+                    r"\Needspace{14\baselineskip}",
+                    rf"\subsection{{Family {family_index:02d}: {latex_escape(describe_group_basis(group).replace('`', ''))}}}",
+                    r"\begin{tcolorbox}[familypanel]",
+                    r"\MetricChip{Certificates}{" + str(group.member_count) + r"}" + " "
+                    + r"\MetricChip{Concrete CNs}{" + str(group.distinct_subject_cn_count) + r"}" + " "
+                    + r"\MetricChip{Distinct SAN profiles}{" + str(group.distinct_exact_content_count) + r"}",
+                    r"\par\medskip",
+                    rf"\textbf{{Matched domains}}: {' '.join(rf'\DomainChip{{{latex_escape(domain)}}}' for domain in sorted(group.matched_domains))}\par",
+                    rf"\textbf{{Family validity span}}: \texttt{{{latex_escape(utc_iso(group.valid_from_min))}}} to \texttt{{{latex_escape(utc_iso(group.valid_to_max))}}}\par",
+                    (
+                        # Conditional row: contributes "" when either bound is
+                        # missing; "" entries are dropped at write time below.
+                        rf"\textbf{{First seen span}}: \texttt{{{latex_escape(utc_iso(group.first_seen_min))}}} to \texttt{{{latex_escape(utc_iso(group.first_seen_max))}}}\par"
+                        if group.first_seen_min and group.first_seen_max
+                        else ""
+                    ),
+                    rf"\textbf{{Revocation mix}}: {group.revocation_counts.get('revoked', 0)} revoked, {group.revocation_counts.get('not_revoked', 0)} not revoked, {group.revocation_counts.get('unknown', 0)} unknown",
+                    r"\end{tcolorbox}",
+                ]
+            )
+
+            # Regroup the family's hits by concrete Subject CN for the
+            # \subsubsection level; ordering is canonicalized-CN first.
+            hits_by_subject: dict[str, list[CertificateHit]] = defaultdict(list)
+            for hit in member_hits:
+                hits_by_subject[hit.subject_cn].append(hit)
+            ordered_subjects = sorted(
+                hits_by_subject.keys(),
+                key=lambda value: (canonicalize_subject_cn(value), value.casefold()),
+            )
+            for subject_cn in ordered_subjects:
+                subject_hits = sorted(
+                    hits_by_subject[subject_cn],
+                    key=lambda hit: (hit.validity_not_before, hit.validity_not_after, hit.fingerprint_sha256),
+                )
+                # NOTE(review): the same unique-SAN set is built twice here;
+                # both the summary and the tree need the sorted entries.
+                san_summary = summarize_san_patterns(sorted({entry for hit in subject_hits for entry in hit.san_entries}))
+                unique_san_entries = sorted({entry for hit in subject_hits for entry in hit.san_entries})
+                lines.extend(
+                    [
+                        r"\Needspace{18\baselineskip}",
+                        rf"\subsubsection{{Subject CN: {latex_escape(subject_cn)}}}",
+                        r"\begin{tcolorbox}[subjectpanel]",
+                        r"\MetricChip{Certificates under this CN}{" + str(len(subject_hits)) + r"}" + " "
+                        + r"\MetricChip{Distinct SAN profiles}{" + str(len({tuple(hit.san_entries) for hit in subject_hits})) + r"}" + " "
+                        + r"\MetricChip{Unique SAN entries}{" + str(len(unique_san_entries)) + r"}",
+                        r"\par\medskip",
+                        rf"\textbf{{Validity span under this CN}}: \texttt{{{latex_escape(utc_iso(min(hit.validity_not_before for hit in subject_hits)))}}} to \texttt{{{latex_escape(utc_iso(max(hit.validity_not_after for hit in subject_hits)))}}}",
+                        r"\par\medskip",
+                        r"\textbf{Certificate timeline}",
+                        r"\begin{itemize}[leftmargin=1.4em,itemsep=0.55em,topsep=0.4em]",
+                    ]
+                )
+                # One timeline item per certificate: status badge, validity
+                # window, then a muted detail line.
+                for hit in subject_hits:
+                    crtsh_ids = ", ".join(str(value) for value in sorted(hit.crtsh_certificate_ids))
+                    lines.extend(
+                        [
+                            r"\item "
+                            + latex_status_badge(hit.revocation_status)
+                            + " "
+                            + rf"\texttt{{{latex_escape(utc_iso(hit.validity_not_before))}}} to \texttt{{{latex_escape(utc_iso(hit.validity_not_after))}}}",
+                            rf"\newline \textcolor{{Muted}}{{SANs: {len(hit.san_entries)} \quad crt.sh: {latex_escape(crtsh_ids)} \quad {latex_escape(one_line_revocation(hit))}}}",
+                        ]
+                    )
+                # SAN tree is chunked so each Verbatim panel fits on a page
+                # without being broken mid-tree.
+                tree_chunks = build_san_tree_chunks_with_style(
+                    unique_san_entries,
+                    ascii_only=True,
+                    max_lines_per_chunk=24,
+                )
+                lines.extend(
+                    [
+                        r"\end{itemize}",
+                        r"\medskip",
+                        r"\textbf{SAN pattern snapshot}",
+                        r"\par\medskip",
+                        r"\MetricChip{DNS SANs}{" + str(san_summary["dns_count"]) + r"}" + " "
+                        + r"\MetricChip{Other SANs}{" + str(san_summary["other_count"]) + r"}" + " "
+                        + r"\MetricChip{Wildcard SANs}{" + str(san_summary["wildcard_count"]) + r"}" + " "
+                        + r"\MetricChip{Numbered SANs}{" + str(san_summary["numbered_count"]) + r"}" + " "
+                        + r"\MetricChip{DNS zones}{" + str(san_summary["zone_count"]) + r"}",
+                        r"\par\medskip",
+                        rf"\textbf{{Dominant zones}}: {latex_escape(', '.join(f'{zone} ({count})' for zone, count in san_summary['top_zones']) if san_summary['top_zones'] else 'none')}",
+                        r"\par",
+                        rf"\textbf{{Repeating host schemas}}: {latex_escape(', '.join(f'{pattern} ({count})' for pattern, count in san_summary['repeating_patterns']) if san_summary['repeating_patterns'] else 'mostly one-off SAN hostnames')}",
+                        (
+                            rf"\par\medskip\textcolor{{Muted}}{{The SAN structure below is shown in {len(tree_chunks)} intact panels so the visual grouping is not broken across a page.}}"
+                            if len(tree_chunks) > 1
+                            else ""
+                        ),
+                        r"\end{tcolorbox}",
+                    ]
+                )
+                for tree_chunk_index, tree_lines in enumerate(tree_chunks, start=1):
+                    tree_title = (
+                        "SAN Structure"
+                        if len(tree_chunks) == 1
+                        else f"SAN Structure ({tree_chunk_index}/{len(tree_chunks)})"
+                    )
+                    # Reserve vertical room proportional to the chunk size,
+                    # clamped to [12, 32] baselines.
+                    tree_needspace = max(12, min(len(tree_lines) + 7, 32))
+                    lines.extend(
+                        [
+                            rf"\Needspace{{{tree_needspace}\baselineskip}}",
+                            rf"\begin{{tcolorbox}}[treepanel,title={{{latex_escape(tree_title)}}}]",
+                            r"\begin{Verbatim}[fontsize=\footnotesize]",
+                        ]
+                    )
+                    lines.extend(tree_lines)
+                    lines.extend(
+                        [
+                            r"\end{Verbatim}",
+                            r"\end{tcolorbox}",
+                        ]
+                    )
+
+    # Closing statistics page, then end the document.
+    lines.extend(
+        [
+            r"\clearpage",
+            r"\section*{Statistics}",
+            r"\addcontentsline{toc}{section}{Statistics}",
+            r"\begin{tcolorbox}[summary]",
+            r"\MetricChip{Unique leaf certificates}{" + str(stats.unique_leaf_certificates) + r"}" + " "
+            + r"\MetricChip{CN-family chapters}{" + str(stats.groups_total) + r"}" + " "
+            + r"\MetricChip{Multi-certificate chapters}{" + str(stats.groups_multi_member) + r"}" + " "
+            + r"\MetricChip{Singleton chapters}{" + str(stats.groups_singleton) + r"}",
+            r"\par\medskip",
+            r"\MetricChip{Numbered CN patterns}{" + str(stats.groups_by_type.get("numbered_cn_pattern", 0)) + r"}" + " "
+            + r"\MetricChip{Exact endpoint families}{" + str(stats.groups_by_type.get("exact_endpoint_family", 0)) + r"}" + " "
+            + r"\MetricChip{Non-leaf filtered}{" + str(stats.verification.non_leaf_filtered) + r"}" + " "
+            + r"\MetricChip{Precert poison filtered}{" + str(stats.verification.precertificate_poison_filtered) + r"}",
+            r"\end{tcolorbox}",
+            r"\end{document}",
+        ]
+    )
+    # Conditional entries above may have contributed "" placeholders; drop
+    # them here so the .tex output has no stray blank lines.
+    path.write_text("\n".join(line for line in lines if line != "") + "\n", encoding="utf-8")
+
+

What this block is doing

Writes the raw inventory appendix as LaTeX for PDF assembly.

+

Flow arrows

Current hits, groups, and trust data. → render_latex_report → Produces the LaTeX appendix source that later becomes PDF.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## cleanup_latex_auxiliary_files + + + + + + +
+
def cleanup_latex_auxiliary_files(tex_path: Path, pdf_output: Path) -> None:
+    """Delete LaTeX by-products (.aux/.log/.out/.toc) next to the output PDF.
+
+    The engine writes its auxiliary files into the PDF's output directory
+    under the .tex file's stem, so candidates are derived from both paths.
+    Missing files are simply skipped.
+    """
+    generated_base = pdf_output.parent / tex_path.stem
+    for suffix in (".aux", ".log", ".out", ".toc"):
+        candidate = generated_base.with_suffix(suffix)
+        if candidate.exists():
+            candidate.unlink()
+
+

What this block is doing

Deletes the LaTeX by-products (`.aux`, `.log`, `.out`, `.toc`) that the engine leaves next to the compiled PDF, so the output directory keeps only the artifacts an operator actually needs.

+

Flow arrows

The `.tex` path and PDF output path from `compile_latex_to_pdf`. → cleanup_latex_auxiliary_files → A clean output directory with the auxiliary `.aux`/`.log`/`.out`/`.toc` files removed.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## compile_latex_to_pdf + + + + + + +
+
def compile_latex_to_pdf(tex_path: Path, pdf_output: Path, engine: str) -> None:
+    """Compile *tex_path* with *engine* and leave the PDF at *pdf_output*.
+
+    Args:
+        tex_path: LaTeX source to compile (resolved to an absolute path).
+        pdf_output: Final PDF location; its parent directory is created.
+        engine: Name of the LaTeX engine binary to look up on PATH.
+
+    Raises:
+        RuntimeError: If the engine binary is not found, or if a compile
+            pass fails (the last 40 lines of engine output are included).
+    """
+    engine_path = shutil.which(engine)
+    if engine_path is None:
+        raise RuntimeError(f"LaTeX engine not found: {engine}")
+    tex_path = tex_path.resolve()
+    pdf_output = pdf_output.resolve()
+    pdf_output.parent.mkdir(parents=True, exist_ok=True)
+    compile_cmd = [
+        engine_path,
+        "-interaction=nonstopmode",
+        "-halt-on-error",
+        "-output-directory",
+        str(pdf_output.parent),
+        str(tex_path),
+    ]
+    # Two identical passes so references built on the first pass (e.g. the
+    # table of contents written to .toc) are picked up on the second.
+    for _ in range(2):
+        result = subprocess.run(
+            compile_cmd,
+            capture_output=True,
+            text=True,
+            check=False,
+        )
+        if result.returncode != 0:
+            message = (result.stdout + "\n" + result.stderr).strip()
+            raise RuntimeError(
+                "LaTeX compilation failed.\n"
+                + "\n".join(message.splitlines()[-40:])
+            )
+    # The engine names the PDF after the .tex stem; move it into place if
+    # the caller asked for a different filename.
+    generated_pdf = pdf_output.parent / f"{tex_path.stem}.pdf"
+    if generated_pdf != pdf_output:
+        generated_pdf.replace(pdf_output)
+    cleanup_latex_auxiliary_files(tex_path, pdf_output)
+
+

What this block is doing

Hands LaTeX to XeLaTeX and turns it into a finished PDF file.

+

Flow arrows

A finished `.tex` file. → compile_latex_to_pdf → Produces the human-readable PDF artifact.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## parse_args + + + + + + +
+
def parse_args() -> argparse.Namespace:
+    """Define and parse the scanner's command-line interface.
+
+    Returns:
+        Namespace carrying input/output paths, cache controls, per-domain
+        query limits, retry count, and verbosity switches.
+    """
+    parser = argparse.ArgumentParser(
+        description="Search crt.sh for currently valid certificates matching configured domain fragments.",
+    )
+    parser.add_argument(
+        "--domains-file",
+        type=Path,
+        default=Path("domains.local.txt"),
+        help="Text file containing one domain fragment per line.",
+    )
+    parser.add_argument(
+        "--output",
+        type=Path,
+        default=Path("output/current-valid-certificates.md"),
+        help="Readable single-file markdown report to write.",
+    )
+    parser.add_argument(
+        "--latex-output",
+        type=Path,
+        default=Path("output/current-valid-certificates.tex"),
+        help="Readable single-file LaTeX report to write.",
+    )
+    parser.add_argument(
+        "--pdf-output",
+        type=Path,
+        default=Path("output/current-valid-certificates.pdf"),
+        help="Compiled PDF report to write.",
+    )
+    parser.add_argument(
+        "--pdf-engine",
+        default="xelatex",
+        help="LaTeX engine used to compile the PDF report.",
+    )
+    parser.add_argument(
+        "--skip-pdf",
+        action="store_true",
+        help="Write Markdown and LaTeX outputs but skip PDF compilation.",
+    )
+    parser.add_argument(
+        "--cache-dir",
+        type=Path,
+        default=Path(".cache/ct-search"),
+        help="Directory for cached per-domain query results.",
+    )
+    parser.add_argument(
+        "--cache-ttl-seconds",
+        type=int,
+        default=900,
+        help="Reuse cached database results younger than this many seconds.",
+    )
+    parser.add_argument(
+        "--max-candidates-per-domain",
+        type=int,
+        default=10000,
+        help="Maximum raw crt.sh identity rows to inspect per domain fragment.",
+    )
+    parser.add_argument(
+        "--retries",
+        type=int,
+        default=3,
+        help="Retry count for replica/recovery conflicts from crt.sh.",
+    )
+    parser.add_argument(
+        "--quiet",
+        action="store_true",
+        help="Suppress progress output.",
+    )
+    return parser.parse_args()
+
+

What this block is doing

This block defines the command-line knobs for the file: input paths, cache settings, output paths, and other runtime switches.

+

Flow arrows

Earlier blocks or operator input feed this block. → parse_args → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## main + + + + + + +
+
def main() -> int:
+    """CLI entrypoint: fetch/cache CT records, group them, render reports.
+
+    Progress lines go to stderr unless --quiet is set.
+
+    Returns:
+        0 on success; failures surface as exceptions.
+    """
+    args = parse_args()
+    domains = load_domains(args.domains_file)
+    all_records: list[DatabaseRecord] = []
+    for domain in domains:
+        # Serve from the on-disk cache when a fresh-enough entry exists for
+        # this domain and candidate budget; otherwise query crt.sh live and
+        # store the result for next time.
+        cached = load_cached_records(
+            cache_dir=args.cache_dir,
+            domain=domain,
+            ttl_seconds=args.cache_ttl_seconds,
+            max_candidates=args.max_candidates_per_domain,
+        )
+        if cached is not None:
+            if not args.quiet:
+                print(f"[cache] domain={domain} records={len(cached)}", file=sys.stderr)
+            all_records.extend(cached)
+            continue
+        if not args.quiet:
+            print(f"[query] domain={domain}", file=sys.stderr)
+        records = query_domain(
+            domain=domain,
+            max_candidates=args.max_candidates_per_domain,
+            attempts=args.retries,
+            verbose=not args.quiet,
+        )
+        if not args.quiet:
+            print(f"[done] domain={domain} records={len(records)}", file=sys.stderr)
+        store_cached_records(args.cache_dir, domain, args.max_candidates_per_domain, records)
+        all_records.extend(records)
+    hits, verification = build_hits(all_records)
+    groups = build_groups(hits)
+    scan_stats = ScanStats(
+        generated_at_utc=utc_iso(datetime.now(UTC)),
+        configured_domains=domains,
+        unique_leaf_certificates=len(hits),
+        groups_total=len(groups),
+        groups_multi_member=sum(1 for group in groups if group.member_count > 1),
+        groups_singleton=sum(1 for group in groups if group.member_count == 1),
+        groups_by_type=dict(Counter(group.group_type for group in groups)),
+        verification=verification,
+    )
+    issuer_trust = query_issuer_trust(hits)
+    # Markdown and LaTeX are always written; PDF compilation is optional.
+    render_markdown_report(args.output, hits, groups, scan_stats, issuer_trust)
+    render_latex_report(args.latex_output, hits, groups, scan_stats, issuer_trust)
+    if not args.skip_pdf:
+        compile_latex_to_pdf(args.latex_output, args.pdf_output, args.pdf_engine)
+    if not args.quiet:
+        print(
+            f"[report] hits={len(hits)} groups={len(groups)} markdown={args.output} latex={args.latex_output}"
+            + ("" if args.skip_pdf else f" pdf={args.pdf_output}"),
+            file=sys.stderr,
+        )
+    return 0
+
+

What this block is doing

The standalone command-line entrypoint for the inventory scanner.

+

Flow arrows

CLI arguments from the operator. → main → Runs the whole scanner end to end.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ diff --git a/teachingNoobs/ct_usage_assessment.md b/teachingNoobs/ct_usage_assessment.md new file mode 100644 index 0000000..16c88e1 --- /dev/null +++ b/teachingNoobs/ct_usage_assessment.md @@ -0,0 +1,645 @@ +# ct_usage_assessment.py + +Source file: [`ct_usage_assessment.py`](../ct_usage_assessment.py) + +Certificate-purpose analyzer. This file looks at EKU and KeyUsage to decide what each certificate is technically allowed to do. + +Main flow in one line: `certificate bytes -> EKU and KeyUsage -> purpose label -> summary counts` + +How to read this page: + +- left side: the actual source code block +- right side: a plain-English explanation for a beginner +- read from top to bottom because later blocks depend on earlier ones + +## Module setup + + + + + + +
+
#!/usr/bin/env python3
+
+from __future__ import annotations
+
+import argparse
+import hashlib
+import json
+from collections import Counter, defaultdict
+from dataclasses import asdict, dataclass
+from datetime import UTC, datetime
+from pathlib import Path
+
+from cryptography import x509
+from cryptography.x509.oid import ExtensionOID
+
+import ct_scan
+
+
+# Extended Key Usage OIDs from the RFC 5280 id-kp arc (1.3.6.1.5.5.7.3.*),
+# plus the special anyExtendedKeyUsage OID (2.5.29.37.0).
+SERVER_AUTH_OID = "1.3.6.1.5.5.7.3.1"
+CLIENT_AUTH_OID = "1.3.6.1.5.5.7.3.2"
+CODE_SIGNING_OID = "1.3.6.1.5.5.7.3.3"
+EMAIL_PROTECTION_OID = "1.3.6.1.5.5.7.3.4"
+TIME_STAMPING_OID = "1.3.6.1.5.5.7.3.8"
+OCSP_SIGNING_OID = "1.3.6.1.5.5.7.3.9"
+ANY_EXTENDED_KEY_USAGE_OID = "2.5.29.37.0"
+
+# Human-readable label for each dotted OID above, used when summarizing
+# EKU sets in the report outputs.
+EKU_LABELS = {
+    SERVER_AUTH_OID: "serverAuth",
+    CLIENT_AUTH_OID: "clientAuth",
+    CODE_SIGNING_OID: "codeSigning",
+    EMAIL_PROTECTION_OID: "emailProtection",
+    TIME_STAMPING_OID: "timeStamping",
+    OCSP_SIGNING_OID: "OCSPSigning",
+    ANY_EXTENDED_KEY_USAGE_OID: "anyExtendedKeyUsage",
+}
+
+

What this block is doing

Purpose-analysis constants and small data shapes for EKU and KeyUsage classification.

+

Flow arrows

Earlier blocks or operator input feed this block. → Module setup → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## PurposeClassification + + + + + + +
+
@dataclass
+class PurposeClassification:
+    """One certificate together with the usage category assigned to it."""
+
+    fingerprint_sha256: str
+    subject_cn: str
+    issuer_name: str
+    category: str  # purpose label derived from the EKU/KeyUsage classification
+    eku_oids: list[str]
+    key_usage_flags: list[str]
+    valid_from_utc: str
+    valid_to_utc: str
+    matched_domains: list[str]
+    san_dns_names: list[str]
+
+

What this block is doing

One certificate plus the usage label assigned to it.

+

Flow arrows

Earlier blocks or operator input feed this block. → PurposeClassification → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## AssessmentSummary + + + + + + +
+
@dataclass
+class AssessmentSummary:
+    """Aggregate roll-up counts that power the purpose-assessment report."""
+
+    generated_at_utc: str
+    source_cache_domains: list[str]
+    unique_leaf_certificates: int
+    category_counts: dict[str, int]
+    eku_templates: dict[str, int]
+    key_usage_templates: dict[str, int]
+    issuer_breakdown: dict[str, dict[str, int]]
+    validity_start_years: dict[str, dict[str, int]]
+    san_type_counts: dict[str, int]
+    subject_cn_in_dns_san_count: int
+    subject_cn_not_in_dns_san_count: int
+    dual_eku_subject_cns_with_server_only_sibling: list[str]
+    dual_eku_subject_cns_without_server_only_sibling: list[str]
+
+

What this block is doing

The roll-up numbers that power the purpose chapter.

+

Flow arrows

Earlier blocks or operator input feed this block. → AssessmentSummary → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## utc_now_iso + + + + + + +
+
def utc_now_iso() -> str:
+    """Return the current UTC time as a second-precision ISO-8601 string
+    with a ``Z`` suffix instead of ``+00:00``."""
+    return datetime.now(UTC).isoformat(timespec="seconds").replace("+00:00", "Z")
+
+

What this block is doing

Returns the current UTC time as a second-precision ISO-8601 timestamp with a trailing `Z` (for example `2026-04-01T16:57:58Z`), so every generated artifact carries a consistent "generated at" stamp.

+

Flow arrows

The system clock. → utc_now_iso → A `Z`-suffixed UTC timestamp string consumed by the summary and report outputs.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+ +## parse_args + + + + + + +
+
def parse_args() -> argparse.Namespace:
    """Define and parse the command-line options for the assessment run.

    All knobs: input domain list, cache location/TTL, crt.sh query limits,
    output paths, and verbosity.
    """
    parser = argparse.ArgumentParser(
        description="Assess certificate intended usage from EKU and KeyUsage."
    )
    # One (flag, kwargs) row per option keeps the declarations compact and
    # makes defaults easy to scan.
    option_table = [
        (
            "--domains-file",
            dict(
                type=Path,
                default=Path("domains.local.txt"),
                help="Configurable list of search domains, one per line.",
            ),
        ),
        (
            "--cache-dir",
            dict(
                type=Path,
                default=Path(".cache/ct-search"),
                help="Directory used by ct_scan.py for cached CT results.",
            ),
        ),
        (
            "--cache-ttl-seconds",
            dict(
                type=int,
                default=86400,
                help="Reuse cached CT results up to this age before refreshing from crt.sh.",
            ),
        ),
        (
            "--max-candidates",
            dict(
                type=int,
                default=10000,
                help="Maximum raw crt.sh identity rows to inspect per configured domain.",
            ),
        ),
        (
            "--attempts",
            dict(
                type=int,
                default=3,
                help="Retry attempts for live crt.sh database queries.",
            ),
        ),
        (
            "--markdown-output",
            dict(
                type=Path,
                default=Path("output/certificate-purpose-assessment.md"),
                help="Human-readable assessment output.",
            ),
        ),
        (
            "--json-output",
            dict(
                type=Path,
                default=Path("output/certificate-purpose-assessment.json"),
                help="Machine-readable assessment output.",
            ),
        ),
        (
            "--verbose",
            dict(
                action="store_true",
                help="Print refresh activity to stderr.",
            ),
        ),
    ]
    for flag, kwargs in option_table:
        parser.add_argument(flag, **kwargs)
    return parser.parse_args()
+
+

What this block is doing

This block defines the command-line knobs for the file: input paths, cache settings, output paths, and other runtime switches.

+

Flow arrows

Earlier blocks or operator input feed this block. → parse_args → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## load_records
+
def load_records(
    domains: list[str],
    cache_dir: Path,
    cache_ttl_seconds: int,
    max_candidates: int,
    attempts: int,
    verbose: bool,
) -> list[ct_scan.DatabaseRecord]:
    """Collect CT records for every domain, preferring fresh-enough cache entries.

    For each domain: use the on-disk cache when it is within the TTL;
    otherwise query crt.sh live and write the result back to the cache.
    """
    collected: list[ct_scan.DatabaseRecord] = []
    for search_domain in domains:
        cached = ct_scan.load_cached_records(
            cache_dir, search_domain, cache_ttl_seconds, max_candidates
        )
        if cached is None:
            # Cache miss or stale entry: refresh from the live database and
            # persist the fresh result for subsequent runs.
            cached = ct_scan.query_domain(
                search_domain,
                max_candidates=max_candidates,
                attempts=attempts,
                verbose=verbose,
            )
            ct_scan.store_cached_records(
                cache_dir, search_domain, max_candidates=max_candidates, records=cached
            )
        collected.extend(cached)
    return collected
+
+

What this block is doing

This block loads data from disk, cache, or an earlier stage so later code can work with it.

+

Flow arrows

Earlier blocks or operator input feed this block. → load_records → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## extract_eku_oids
+
def extract_eku_oids(cert: x509.Certificate) -> list[str]:
    """Return the certificate's Extended Key Usage OIDs as sorted dotted strings.

    Certificates without an EKU extension yield an empty list.
    """
    try:
        eku_extension = cert.extensions.get_extension_for_oid(ExtensionOID.EXTENDED_KEY_USAGE)
    except x509.ExtensionNotFound:
        return []
    dotted = [oid.dotted_string for oid in eku_extension.value]
    dotted.sort()
    return dotted
+
+

What this block is doing

This block pulls one specific piece of information out of a larger object.

+

Flow arrows

One certificate object. → extract_eku_oids → `classify_purpose` uses these OIDs to decide the category.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## extract_key_usage_flags
+
def extract_key_usage_flags(cert: x509.Certificate) -> list[str]:
    """List the asserted KeyUsage flags, or [] when the extension is absent."""
    try:
        usage = cert.extensions.get_extension_for_oid(ExtensionOID.KEY_USAGE).value
    except x509.ExtensionNotFound:
        return []
    base_attributes = (
        "digital_signature",
        "content_commitment",
        "key_encipherment",
        "data_encipherment",
        "key_agreement",
        "key_cert_sign",
        "crl_sign",
    )
    asserted = [name for name in base_attributes if getattr(usage, name)]
    # encipher_only / decipher_only may only be read when key_agreement is
    # asserted; the cryptography library raises otherwise, so keep the guard.
    if usage.key_agreement:
        asserted.extend(
            name
            for name in ("encipher_only", "decipher_only")
            if getattr(usage, name)
        )
    return asserted
+
+

What this block is doing

This block pulls one specific piece of information out of a larger object.

+

Flow arrows

One certificate object. → extract_key_usage_flags → `build_classifications` stores these flags as supporting evidence.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## classify_purpose
+
def classify_purpose(eku_oids: list[str]) -> str:
    """Map a certificate's EKU OID list onto one coarse purpose category.

    The anyExtendedKeyUsage OID counts as both serverAuth and clientAuth.
    Any combination involving code signing or email alongside TLS usages
    falls into "mixed_or_other".
    """
    if not eku_oids:
        return "no_eku"
    present = set(eku_oids)
    wildcard = ANY_EXTENDED_KEY_USAGE_OID in present
    server = wildcard or SERVER_AUTH_OID in present
    client = wildcard or CLIENT_AUTH_OID in present
    signing = CODE_SIGNING_OID in present
    email = EMAIL_PROTECTION_OID in present

    # Pure TLS buckets: only reachable when neither signing nor email appears.
    if not signing and not email:
        if server and client:
            return "tls_server_and_client"
        if server:
            return "tls_server_only"
        if client:
            return "client_auth_only"
    if email and not server and not client and not signing:
        return "smime_only"
    if signing and not server and not client and not email:
        return "code_signing_only"
    return "mixed_or_other"
+
+

What this block is doing

This block applies rules and chooses a category label.

+

Flow arrows

The EKU OID list from one certificate. → classify_purpose → `build_classifications` turns that decision into a per-certificate record.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## format_eku_template
+
def format_eku_template(eku_oids: list[str]) -> str:
    """Render an EKU OID list as a readable, comma-separated template string.

    Known OIDs are replaced by their friendly labels; unknown OIDs pass
    through unchanged. An empty list renders as "(none)".
    """
    if not eku_oids:
        return "(none)"
    labels = [EKU_LABELS.get(oid, oid) for oid in eku_oids]
    return ", ".join(labels)
+
+

What this block is doing

This function is one of the building blocks inside `ct_usage_assessment.py`. It exists so the file can do one narrow job at a time instead of one giant unreadable routine.

+

Flow arrows

Earlier blocks or operator input feed this block. → format_eku_template → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## format_key_usage_template
+
def format_key_usage_template(flags: list[str]) -> str:
    """Render KeyUsage flags as one comma-separated template string.

    An absent/empty flag list renders as "(missing)".
    """
    return ", ".join(flags) if flags else "(missing)"
+
+

What this block is doing

This function is one of the building blocks inside `ct_usage_assessment.py`. It exists so the file can do one narrow job at a time instead of one giant unreadable routine.

+

Flow arrows

Earlier blocks or operator input feed this block. → format_key_usage_template → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## build_classifications
+
def build_classifications(
    hits: list[ct_scan.CertificateHit],
    records: list[ct_scan.DatabaseRecord],
) -> list[PurposeClassification]:
    """Label every current leaf certificate hit with its intended-usage category.

    Builds a SHA-256-fingerprint -> parsed-certificate index from the raw
    records (leaf certificates only), then classifies each hit from its EKU
    content. Results are sorted for stable, diff-friendly output.

    Raises:
        KeyError: if a hit has no matching leaf record in ``records``
            (would indicate an upstream inconsistency in ct_scan.build_hits).
    """
    certificates_by_fingerprint: dict[str, x509.Certificate] = {}
    for record in records:
        cert = x509.load_der_x509_certificate(record.certificate_der)
        is_leaf, _reason = ct_scan.is_leaf_certificate(cert)
        if not is_leaf:
            continue
        fingerprint_sha256 = hashlib.sha256(record.certificate_der).hexdigest()
        # setdefault keeps the first parsed copy when the same certificate
        # appears in several records.
        certificates_by_fingerprint.setdefault(fingerprint_sha256, cert)

    results: list[PurposeClassification] = []
    for hit in hits:
        cert = certificates_by_fingerprint[hit.fingerprint_sha256]
        # Extract the EKU list once; the original computed it twice per hit
        # (once for the category, once for the stored field).
        eku_oids = extract_eku_oids(cert)
        san_dns_names = sorted(entry[4:] for entry in hit.san_entries if entry.startswith("DNS:"))
        results.append(
            PurposeClassification(
                fingerprint_sha256=hit.fingerprint_sha256,
                subject_cn=hit.subject_cn,
                issuer_name=ct_scan.primary_issuer_name(hit),
                category=classify_purpose(eku_oids),
                eku_oids=eku_oids,
                key_usage_flags=extract_key_usage_flags(cert),
                valid_from_utc=ct_scan.utc_iso(hit.validity_not_before),
                valid_to_utc=ct_scan.utc_iso(hit.validity_not_after),
                matched_domains=sorted(hit.matched_domains),
                san_dns_names=san_dns_names,
            )
        )
    # Deterministic ordering: category, then case-insensitive CN, then start
    # time, with the fingerprint as the final tiebreaker.
    results.sort(
        key=lambda item: (
            item.category,
            item.subject_cn.casefold(),
            item.valid_from_utc,
            item.fingerprint_sha256,
        )
    )
    return results
+
+

What this block is doing

Walks through all current certificates and labels them by intended usage.

+

Flow arrows

The cleaned current hits plus raw records. → build_classifications → `summarize` compresses these rows into report-level counts.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## summarize
+
def summarize(classifications: list[PurposeClassification], domains: list[str]) -> AssessmentSummary:
    """Compress per-certificate purpose labels into corpus-level statistics.

    Args:
        classifications: Per-certificate rows from ``build_classifications``.
        domains: The configured search domains, recorded for provenance.

    Returns:
        An AssessmentSummary holding category counts, EKU/KeyUsage templates,
        issuer and start-year breakdowns, SAN statistics, and the dual-EKU
        Subject-CN family split.
    """
    category_counts = Counter(item.category for item in classifications)
    eku_templates = Counter(format_eku_template(item.eku_oids) for item in classifications)
    key_usage_templates = Counter(format_key_usage_template(item.key_usage_flags) for item in classifications)
    issuer_breakdown: dict[str, Counter[str]] = defaultdict(Counter)
    validity_start_years: dict[str, Counter[str]] = defaultdict(Counter)
    san_type_counts: Counter[str] = Counter()
    subject_cn_in_dns_san_count = 0
    subject_cn_not_in_dns_san_count = 0
    categories_by_canonical_cn: dict[str, set[str]] = defaultdict(set)

    for item in classifications:
        issuer_breakdown[item.category][item.issuer_name] += 1
        # valid_from_utc is ISO-8601, so the first four characters are the year.
        validity_start_years[item.category][item.valid_from_utc[:4]] += 1
        # Classification rows only carry DNS SAN names, so this counter only
        # ever grows a "DNSName" bucket.
        san_type_counts["DNSName"] += len(item.san_dns_names)
        # Plain list membership suffices here; the original built a throwaway
        # set per certificate for a single containment test.
        if item.subject_cn in item.san_dns_names:
            subject_cn_in_dns_san_count += 1
        else:
            subject_cn_not_in_dns_san_count += 1
        categories_by_canonical_cn[ct_scan.canonicalize_subject_cn(item.subject_cn)].add(item.category)

    # Split dual-EKU CN families by whether a strict server-only sibling exists.
    dual_with_server_only = sorted(
        canonical_cn
        for canonical_cn, values in categories_by_canonical_cn.items()
        if "tls_server_and_client" in values and "tls_server_only" in values
    )
    dual_without_server_only = sorted(
        canonical_cn
        for canonical_cn, values in categories_by_canonical_cn.items()
        if values == {"tls_server_and_client"}
    )

    return AssessmentSummary(
        generated_at_utc=utc_now_iso(),
        source_cache_domains=domains,
        unique_leaf_certificates=len(classifications),
        category_counts=dict(category_counts),
        eku_templates=dict(eku_templates.most_common()),
        key_usage_templates=dict(key_usage_templates.most_common()),
        issuer_breakdown={category: dict(counter.most_common()) for category, counter in issuer_breakdown.items()},
        validity_start_years={
            category: dict(sorted(counter.items()))
            for category, counter in validity_start_years.items()
        },
        san_type_counts=dict(san_type_counts),
        subject_cn_in_dns_san_count=subject_cn_in_dns_san_count,
        subject_cn_not_in_dns_san_count=subject_cn_not_in_dns_san_count,
        dual_eku_subject_cns_with_server_only_sibling=dual_with_server_only,
        dual_eku_subject_cns_without_server_only_sibling=dual_without_server_only,
    )
+
+

What this block is doing

Compresses the per-certificate labels into counts, templates, and issuer breakdowns.

+

Flow arrows

The per-certificate purpose labels. → summarize → Current-state and monograph chapters use the summary counts and templates.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## render_markdown
+
def render_markdown(summary: AssessmentSummary, classifications: list[PurposeClassification]) -> str:
    """Render the standalone certificate-purpose assessment as Markdown.

    Args:
        summary: The corpus-level roll-up from ``summarize``.
        classifications: The per-certificate rows, used for the detailed
            dual-EKU section.

    Returns:
        The full Markdown document, ending with a trailing newline.
    """
    lines: list[str] = []
    lines.append("# Certificate Purpose Assessment")
    lines.append("")
    lines.append(f"Generated at: `{summary.generated_at_utc}`")
    lines.append(f"Configured domains: `{', '.join(summary.source_cache_domains)}`")
    lines.append("")
    lines.append("## Headline Verdict")
    lines.append("")
    lines.append(f"- Unique current leaf certificates assessed: **{summary.unique_leaf_certificates}**")
    lines.append(f"- TLS server only: **{summary.category_counts.get('tls_server_only', 0)}**")
    lines.append(f"- TLS server and client auth: **{summary.category_counts.get('tls_server_and_client', 0)}**")
    lines.append(f"- Client auth only: **{summary.category_counts.get('client_auth_only', 0)}**")
    lines.append(f"- S/MIME only: **{summary.category_counts.get('smime_only', 0)}**")
    lines.append(f"- Code signing only: **{summary.category_counts.get('code_signing_only', 0)}**")
    lines.append(f"- Mixed or other: **{summary.category_counts.get('mixed_or_other', 0)}**")
    lines.append(f"- No EKU: **{summary.category_counts.get('no_eku', 0)}**")
    lines.append("")
    lines.append("## What This Means")
    lines.append("")
    # NOTE(review): the next two bullets are fixed narrative text that is not
    # derived from the counts above — if a future corpus contains non-TLS or
    # non-DNS-SAN certificates, this section will overstate its claims.
    # Confirm whether these should be made conditional on the actual counts.
    lines.append("- The corpus contains **only TLS-capable certificates**. There are no client-only, S/MIME, or code-signing certificates.")
    lines.append("- All SAN entries seen in this corpus are DNS names.")
    lines.append(f"- Subject CN appears literally in a DNS SAN for **{summary.subject_cn_in_dns_san_count} of {summary.unique_leaf_certificates}** certificates.")
    lines.append("- The only ambiguity is whether to keep or set aside the certificates whose EKU allows both `serverAuth` and `clientAuth`.")
    lines.append("")
    lines.append("## Rework Options")
    lines.append("")
    lines.append(f"- Keep the full operational server corpus: **{summary.unique_leaf_certificates}** certificates.")
    lines.append(f"- Keep only strict server-auth certificates: **{summary.category_counts.get('tls_server_only', 0)}** certificates.")
    lines.append(f"- Create a review bucket for dual-EKU certificates: **{summary.category_counts.get('tls_server_and_client', 0)}** certificates.")
    lines.append("")
    lines.append("## EKU Templates")
    lines.append("")
    for template, count in summary.eku_templates.items():
        lines.append(f"- `{template}`: {count}")
    lines.append("")
    lines.append("## KeyUsage Templates")
    lines.append("")
    for template, count in summary.key_usage_templates.items():
        lines.append(f"- `{template}`: {count}")
    lines.append("")
    lines.append("## Issuer Breakdown")
    lines.append("")
    for category in sorted(summary.issuer_breakdown):
        lines.append(f"### `{category}`")
        lines.append("")
        for issuer_name, count in summary.issuer_breakdown[category].items():
            lines.append(f"- `{issuer_name}`: {count}")
        lines.append("")
    lines.append("## Time Pattern")
    lines.append("")
    dual_years = set(summary.validity_start_years.get("tls_server_and_client", {}))
    server_years = set(summary.validity_start_years.get("tls_server_only", {}))
    if dual_years and len(dual_years) == 1:
        # min() is the direct way to spell the single year; the original used
        # the convoluted next(iter(sorted(dual_years))) for the same value.
        lines.append(
            f"- The dual-EKU bucket is entirely composed of certificates whose current validity starts in **{min(dual_years)}**."
        )
    if dual_years and server_years and dual_years != server_years:
        lines.append("- The year split suggests at least some change in issuance policy over time.")
    else:
        lines.append("- Time alone does not prove a migration. The stronger signal is the template split by issuer and EKU.")
    lines.append("")
    for category in sorted(summary.validity_start_years):
        year_counts = ", ".join(f"{year}: {count}" for year, count in summary.validity_start_years[category].items())
        lines.append(f"- `{category}`: {year_counts}")
    lines.append("")
    lines.append("## Interpretation")
    lines.append("")
    lines.append("- The `tls_server_and_client` certificates still look like hostname certificates, not user or robot identity certificates.")
    lines.append("- Evidence: public DNS-style Subject CNs, DNS-only SANs, public WebPKI server-auth issuers, and no email or personal-name SAN material.")
    lines.append("- The most plausible reading is **legacy or permissive server certificate templates** that also included `clientAuth`, not a separate client-certificate estate.")
    lines.append("")
    lines.append("## Dual-EKU Hostname Overlap")
    lines.append("")
    lines.append(
        f"- Dual-EKU subject CN families that also have a strict server-only sibling: **{len(summary.dual_eku_subject_cns_with_server_only_sibling)}**"
    )
    lines.append(
        f"- Dual-EKU subject CN families that currently appear only in the dual-EKU bucket: **{len(summary.dual_eku_subject_cns_without_server_only_sibling)}**"
    )
    lines.append("")
    if summary.dual_eku_subject_cns_with_server_only_sibling:
        lines.append("### Dual-EKU Families With Server-Only Siblings")
        lines.append("")
        for subject_cn in summary.dual_eku_subject_cns_with_server_only_sibling:
            lines.append(f"- `{subject_cn}`")
        lines.append("")
    if summary.dual_eku_subject_cns_without_server_only_sibling:
        lines.append("### Dual-EKU Families Without Server-Only Siblings")
        lines.append("")
        for subject_cn in summary.dual_eku_subject_cns_without_server_only_sibling:
            lines.append(f"- `{subject_cn}`")
        lines.append("")
    lines.append("## Detailed Dual-EKU Certificates")
    lines.append("")
    dual_items = [item for item in classifications if item.category == "tls_server_and_client"]
    if not dual_items:
        lines.append("- None")
        lines.append("")
    else:
        for item in dual_items:
            # Cap the inline SAN sample at eight names to keep the report tidy.
            dns_sample = ", ".join(item.san_dns_names[:8])
            if len(item.san_dns_names) > 8:
                dns_sample += ", ..."
            lines.append(f"### `{item.subject_cn}`")
            lines.append("")
            lines.append(f"- Issuer: `{item.issuer_name}`")
            lines.append(f"- Validity: `{item.valid_from_utc}` to `{item.valid_to_utc}`")
            lines.append(f"- Matched search domains: `{', '.join(item.matched_domains)}`")
            lines.append(f"- EKU: `{format_eku_template(item.eku_oids)}`")
            lines.append(f"- KeyUsage: `{format_key_usage_template(item.key_usage_flags)}`")
            lines.append(f"- DNS SAN count: `{len(item.san_dns_names)}`")
            lines.append(f"- DNS SAN sample: `{dns_sample}`")
            lines.append("")
    return "\n".join(lines) + "\n"
+
+

What this block is doing

Writes the standalone purpose report.

+

Flow arrows

Earlier blocks or operator input feed this block. → render_markdown → Later blocks in the same file or in the next analytical stage consume its output.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
## main
+
def main() -> int:
    """Run the purpose assessment end to end and write both report files.

    Loads CT records for the configured domains, classifies current leaf
    certificates, summarizes them, and writes the Markdown and JSON outputs.
    Always returns 0.
    """
    args = parse_args()
    domains = ct_scan.load_domains(args.domains_file)
    records = load_records(
        domains=domains,
        cache_dir=args.cache_dir,
        cache_ttl_seconds=args.cache_ttl_seconds,
        max_candidates=args.max_candidates,
        attempts=args.attempts,
        verbose=args.verbose,
    )
    hits, verification = ct_scan.build_hits(records)
    classifications = build_classifications(hits, records)
    summary = summarize(classifications, domains)

    report_markdown = render_markdown(summary, classifications)
    report_json = json.dumps(
        {
            "summary": asdict(summary),
            "verification": asdict(verification),
            "classifications": [asdict(item) for item in classifications],
        },
        indent=2,
        sort_keys=True,
    )

    # Ensure both destination directories exist before writing.
    for destination in (args.markdown_output, args.json_output):
        destination.parent.mkdir(parents=True, exist_ok=True)
    args.markdown_output.write_text(report_markdown, encoding="utf-8")
    args.json_output.write_text(report_json, encoding="utf-8")
    return 0
+
+

What this block is doing

The standalone command-line entrypoint for the purpose analyzer.

+

Flow arrows

CLI arguments from the operator. → main → Runs the standalone purpose analysis end to end.

+

How to think about it

Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?

+
+