#!/usr/bin/env python3 from __future__ import annotations import ast import html from pathlib import Path ROOT = Path(__file__).resolve().parents[1] OUT_DIR = ROOT / "teachingNoobs" SOURCE_FILES = [ "ct_scan.py", "ct_dns_utils.py", "ct_usage_assessment.py", "ct_lineage_report.py", "ct_caa_analysis.py", "ct_focus_subjects.py", "ct_master_report.py", "ct_monograph_report.py", ] FILE_INTROS = { "ct_scan.py": ( "Core Certificate Transparency scanner. This file talks to crt.sh's public " "database, downloads the real certificate bytes, verifies that they are real " "leaf certificates, groups them into readable families, and can render the " "full inventory appendix." ), "ct_dns_utils.py": ( "Public DNS scanner. This file runs dig, follows alias chains, finds public " "addresses, and collapses raw DNS evidence into readable delivery labels." ), "ct_usage_assessment.py": ( "Certificate-purpose analyzer. This file looks at EKU and KeyUsage to decide " "what each certificate is technically allowed to do." ), "ct_lineage_report.py": ( "Historical analyzer. This file studies expired plus current certificates to " "find renewals, overlap, drift, and issuance bursts over time." ), "ct_caa_analysis.py": ( "CAA analyzer. This file resolves live DNS issuance policy and compares it " "against the public CA families that are actually covering the names today." ), "ct_focus_subjects.py": ( "Focused-cohort analyzer. This file takes your special hand-picked Subject CN " "list and compares it against the wider certificate and DNS estate." ), "ct_master_report.py": ( "Current-state synthesizer. This file combines certificate facts, DNS facts, " "purpose classification, grouping, and curated examples into one report bundle." ), "ct_monograph_report.py": ( "Publication builder. This file takes all analytical layers and turns them into " "the final monograph in Markdown, LaTeX, and PDF." ), } FILE_FLOW_STRIPS = { "ct_scan.py": "domains file -> raw CT query -> parsed leaf certificates -> CN families -> issuer trust -> appendix reports", "ct_dns_utils.py": "DNS name -> dig answers -> normalized observation -> provider hints -> delivery label", "ct_usage_assessment.py": "certificate bytes -> EKU and KeyUsage -> purpose label -> summary counts", "ct_lineage_report.py": "historical CT rows -> historical certificates -> grouped by Subject CN -> overlap and drift checks -> red flags", "ct_caa_analysis.py": "DNS name -> effective CAA lookup -> allowed CA families -> compare with live cert families", "ct_focus_subjects.py": "focus-subject file -> cohort entries -> compare against current and historical estate -> bucketed cohort explanation", "ct_master_report.py": "current CT facts + DNS facts + usage facts -> one current-state report bundle", "ct_monograph_report.py": "current-state bundle + history + CAA + focused cohort -> Markdown/LaTeX/PDF monograph", } BLOCK_NOTES = { "ct_scan.py": { "__module__": "Imports, SQL, constants, and shared data shapes for the core CT scanner.", "DatabaseRecord": "A raw row as it comes back from the crt.sh database before local cleanup.", "CertificateHit": "The cleaned working object used by the rest of the analytics pipeline.", "VerificationStats": "A tiny running counter that proves how many rows were kept or rejected.", "CertificateGroup": "One readable family of related certificates after grouping logic runs.", "ScanStats": "Top-level summary numbers used in reports.", "IssuerTrustInfo": "Stores the public-trust picture for one issuer family.", "connect": "Opens the direct guest PostgreSQL connection to crt.sh's certwatch backend.", "query_domain": "Runs the main certificate query for one search term and refuses silent undercounting.", "query_raw_match_count": "Counts how many raw hits exist before the capped query runs.", "build_hits": "Parses certificate bytes, rejects bad objects, and merges duplicate views of the same cert.", "build_groups": "Turns a flat certificate list into CN-based families such as exact endpoints or numbered rails.", "query_issuer_trust": "Checks which issuers are currently trusted for public TLS in the major WebPKI contexts.", "render_markdown_report": "Writes the raw inventory appendix as readable Markdown.", "render_latex_report": "Writes the raw inventory appendix as LaTeX for PDF assembly.", "compile_latex_to_pdf": "Hands LaTeX to XeLaTeX and turns it into a finished PDF file.", "main": "The standalone command-line entrypoint for the inventory scanner.", }, "ct_dns_utils.py": { "__module__": "Shared DNS scanning helpers, cache helpers, and the logic that turns raw DNS answers into platform clues.", "DnsObservation": "One complete DNS observation for one hostname.", "scan_name_live": "Runs the live DNS walk for one hostname.", "scan_name_cached": "Reuses a recent DNS result if possible, otherwise performs the live scan.", "infer_provider_hints": "Reads the raw DNS trail and pulls out likely platform or vendor clues.", "infer_stack_signature": "Collapses several low-level DNS clues into one human-readable delivery label.", "provider_explanations": "Supplies the glossary text used later in the reports.", }, "ct_usage_assessment.py": { "__module__": "Purpose-analysis constants and small data shapes for EKU and KeyUsage classification.", "PurposeClassification": "One certificate plus the usage label assigned to it.", "AssessmentSummary": "The roll-up numbers that power the purpose chapter.", "build_classifications": "Walks through all current certificates and labels them by intended usage.", "summarize": "Compresses the per-certificate labels into counts, templates, and issuer breakdowns.", "render_markdown": "Writes the standalone purpose report.", "main": "The standalone command-line entrypoint for the purpose analyzer.", }, "ct_lineage_report.py": { "__module__": "Historical query logic, data structures, and red-flag rules for certificate lifecycle analysis.", "HistoricalCertificate": "One certificate in the full time-based dataset, including expired ones.", "CnCollisionRow": "A table row for Subject-DN drift or issuer drift under the same Subject CN.", "SanChangeRow": "A table row that describes SAN-profile change for one Subject CN.", "OverlapRow": "A table row describing long predecessor/successor overlap.", "RedFlagRow": "A compact summary row for names worth attention.", "HistoricalAssessment": "The full historical analysis bundle used by the monograph.", "query_historical_domain": "Fetches the wider historical corpus for one search term.", "build_certificates": "Converts raw DB rows into historical working objects.", "dn_change_rows": "Finds names whose formal Subject DN changed over time.", "issuer_change_rows": "Finds names whose issuing CA family changed over time.", "san_change_rows": "Finds names whose SAN bundle changed over time.", "overlap_rows": "Finds predecessor/successor pairs that overlap too long.", "build_assessment": "Runs the full historical workflow and returns the finished analytical bundle.", "render_markdown": "Writes the standalone historical report in Markdown.", "render_latex": "Writes the standalone historical report in LaTeX.", "main": "The standalone command-line entrypoint for the historical analyzer.", }, "ct_caa_analysis.py": { "__module__": "Data structures and lookup logic for effective CAA policy analysis.", "CaaObservation": "One resolved CAA result before it is merged with certificate coverage data.", "CaaNameRow": "One final row that compares DNS policy with current live certificate families.", "CaaAnalysis": "The full CAA analysis bundle used by the monograph.", "relevant_caa_live": "Finds the effective live CAA for one name, including inheritance and alias behavior.", "build_analysis": "Runs CAA across the whole SAN namespace and compares policy with live issuance.", "rows_for_zone": "Filters the full analysis down to one configured DNS zone.", }, "ct_focus_subjects.py": { "__module__": "Rules and data shapes for analyzing the special hand-picked Subject-CN cohort.", "FocusSubject": "One line from the local focus-subject file.", "FocusSubjectDetail": "One detailed analytical row for one focused Subject CN.", "FocusCohortAnalysis": "The full cohort comparison bundle used in the monograph.", "load_focus_subjects": "Reads the local focus-subject list and any analyst notes attached to it.", "classify_taxonomy_bucket": "Places a name into the direct-front, platform-anchor, or ambiguous bucket.", "observed_role": "Tries to describe what role the name appears to play in the public estate.", "build_analysis": "Runs the full comparison between the focused cohort and the rest of the estate.", }, "ct_master_report.py": { "__module__": "Current-state report assembly code that sits above the low-level scanners.", "ExampleBlock": "A small narrative evidence block used in the naming chapter.", "load_records": "Loads current CT records for all configured search terms.", "enrich_dns": "Adds DNS observations and provider clues to the raw SAN-name list.", "pick_examples": "Chooses a few representative examples that make the naming and DNS story understandable.", "build_group_digest": "Builds a compact family catalogue used in reports.", "summarize_for_report": "Creates the big current-state dictionary consumed by the monograph builder.", "render_markdown": "Writes the shorter consolidated report in Markdown.", "render_latex": "Writes the shorter consolidated report in LaTeX.", "main": "The standalone command-line entrypoint for the consolidated current-state report.", }, "ct_monograph_report.py": { "__module__": "The orchestration and publishing layer that turns all analytical modules into one publication.", "render_appendix_inventory": "Generates the hidden full inventory appendix before the main monograph is assembled.", "append_longtable": "Shared LaTeX helper for readable multi-page tables.", "render_markdown": "Writes the narrative monograph in Markdown.", "render_latex": "Writes the narrative monograph in LaTeX.", "main": "The top-level command-line entrypoint for the complete monograph build.", }, } BLOCK_FLOWS = { "ct_scan.py": { "Module setup": ("Nothing yet; this is the starting point.", "`connect`, `query_domain`, `build_hits`, and the report renderers use these shared definitions."), "load_domains": ("Operator's local config file.", "`query_domain` and the higher-level loaders use this cleaned domain list."), "connect": ("Called by query functions that need live crt.sh data.", "`query_domain`, `query_raw_match_count`, and issuer-trust lookups all depend on this connection."), "query_raw_match_count": ("A domain string from the local config.", "`query_domain` uses this count to refuse silent undercounting."), "query_domain": ("A domain plus the safety cap and retry settings.", "`build_hits` receives the raw records returned here."), "build_hits": ("Raw `DatabaseRecord` rows from crt.sh.", "`build_groups`, purpose analysis, DNS analysis, and CAA analysis all consume these cleaned hits."), "build_groups": ("The flat list of `CertificateHit` objects.", "The report builders use these groups to turn raw certificate clutter into readable families."), "query_issuer_trust": ("The cleaned current certificate hits.", "Report builders use this trust view in the certificate chapters and appendix tables."), "render_markdown_report": ("Current hits, groups, and trust data.", "Produces the Markdown inventory appendix."), "render_latex_report": ("Current hits, groups, and trust data.", "Produces the LaTeX appendix source that later becomes PDF."), "compile_latex_to_pdf": ("A finished `.tex` file.", "Produces the human-readable PDF artifact."), "main": ("CLI arguments from the operator.", "Runs the whole scanner end to end."), }, "ct_dns_utils.py": { "Module setup": ("Nothing yet; this is the starting point.", "The later DNS helpers all reuse these imports and small shared helpers."), "run_dig": ("A hostname and record type.", "`scan_name_live`, `dig_status`, `dig_short`, and `ptr_lookup` all rely on this."), "scan_name_live": ("One DNS name from a SAN entry.", "`scan_name_cached` returns this result shape to higher-level analytics."), "scan_name_cached": ("A DNS name plus cache settings.", "`ct_master_report.enrich_dns` uses this for every SAN name in the current corpus."), "infer_provider_hints": ("One normalized DNS observation.", "`infer_stack_signature` and the report layers use the hints it produces."), "infer_stack_signature": ("One DNS observation plus provider clues.", "`ct_master_report` uses the resulting label in naming and DNS chapters."), "provider_explanations": ("The delivery labels used by the report.", "The monograph glossary uses these explanations directly."), }, "ct_usage_assessment.py": { "extract_eku_oids": ("One certificate object.", "`classify_purpose` uses these OIDs to decide the category."), "extract_key_usage_flags": ("One certificate object.", "`build_classifications` stores these flags as supporting evidence."), "classify_purpose": ("The EKU OID list from one certificate.", "`build_classifications` turns that decision into a per-certificate record."), "build_classifications": ("The cleaned current hits plus raw records.", "`summarize` compresses these rows into report-level counts."), "summarize": ("The per-certificate purpose labels.", "Current-state and monograph chapters use the summary counts and templates."), "main": ("CLI arguments from the operator.", "Runs the standalone purpose analysis end to end."), }, "ct_lineage_report.py": { "query_historical_domain": ("A configured search domain.", "`load_records` uses it to build the wider historical corpus."), "build_certificates": ("Historical `DatabaseRecord` rows.", "`group_by_subject_cn` and all drift checks consume these normalized historical certificates."), "group_by_subject_cn": ("Historical certificates.", "`dn_change_rows`, `issuer_change_rows`, `san_change_rows`, and `overlap_rows` all work off this grouping."), "dn_change_rows": ("CN-grouped historical certificates.", "`build_assessment` uses these rows for Subject-DN drift sections."), "issuer_change_rows": ("CN-grouped historical certificates.", "`build_assessment` uses these rows for CA-family drift sections."), "san_change_rows": ("CN-grouped historical certificates.", "`build_assessment` uses these rows for SAN-drift sections."), "overlap_rows": ("CN-grouped historical certificates.", "`build_assessment` turns these into current and past overlap red flags."), "build_assessment": ("Historical records from all configured domains.", "The monograph and standalone historical reports consume this one big bundle."), "main": ("CLI arguments from the operator.", "Runs the standalone historical analysis end to end."), }, "ct_caa_analysis.py": { "relevant_caa_live": ("One DNS name from the SAN universe.", "`build_analysis` uses this to learn the effective issuance policy per name."), "allowed_ca_families": ("Raw CAA rows for one effective policy.", "`build_analysis` uses the normalized families for policy-vs-live comparison."), "build_analysis": ("Current certificate hits and the configured zones.", "The monograph uses this for the CAA chapter and appendix."), "rows_for_zone": ("The full CAA analysis bundle.", "The monograph uses zone-filtered rows for per-zone policy tables."), }, "ct_focus_subjects.py": { "load_focus_subjects": ("The local focus-subject file.", "`build_analysis` uses these parsed cohort entries."), "classify_taxonomy_bucket": ("One focused Subject CN plus surrounding evidence.", "`build_analysis` uses the bucket label in the focused-cohort chapter."), "observed_role": ("One focused Subject CN plus public evidence.", "`build_analysis` stores the plain-English role description."), "build_analysis": ("The focus-subject list, current-state report, and historical assessment.", "The monograph uses the resulting bundle for Chapter 8 and Appendix D."), }, "ct_master_report.py": { "load_records": ("Configured domains from the local file.", "`summarize_for_report` uses the returned CT rows as its starting point."), "enrich_dns": ("The unique SAN DNS names from current hits.", "`summarize_for_report` uses the enriched observations for DNS chapters and examples."), "pick_examples": ("Current hits, groups, and DNS observations.", "`summarize_for_report` stores the chosen examples for the naming chapter."), "build_group_digest": ("Current groups plus DNS observations.", "Report builders use the digest in appendices and summary tables."), "summarize_for_report": ("Current CT rows, DNS observations, issuer trust, and usage facts.", "`ct_monograph_report.main` consumes this as the main current-state input."), "main": ("CLI arguments from the operator.", "Runs the shorter consolidated current-state report end to end."), }, "ct_monograph_report.py": { "render_appendix_inventory": ("The current-state report bundle.", "Creates the hidden appendix files that are later embedded into the monograph."), "render_markdown": ("Current-state facts, history, CAA, and focused-cohort analysis.", "Produces the main Markdown monograph."), "render_latex": ("Current-state facts, history, CAA, and focused-cohort analysis.", "Produces the main LaTeX monograph source."), "main": ("CLI arguments from the operator.", "Runs the full publication pipeline from raw analytics to finished PDF."), }, } CURRICULUM = """# teachingNoobs Curriculum Open each file in VS Code and use Markdown Preview. The intended order is: 1. [ct_scan.md](./ct_scan.md) Why first: this is the core analytics engine. If you understand this file, you understand where the certificate facts come from. 2. [ct_dns_utils.md](./ct_dns_utils.md) Why second: this explains how the DNS side was scanned and interpreted. 3. [ct_usage_assessment.md](./ct_usage_assessment.md) Why third: this explains how certificate purpose was classified from EKU and KeyUsage. 4. [ct_lineage_report.md](./ct_lineage_report.md) Why fourth: this adds historical time and red-flag logic. 5. [ct_caa_analysis.md](./ct_caa_analysis.md) Why fifth: this adds the DNS-side issuance-policy layer. 6. [ct_focus_subjects.md](./ct_focus_subjects.md) Why sixth: this explains the special hand-picked Subject-CN cohort logic. 7. [ct_master_report.md](./ct_master_report.md) Why seventh: this shows how the current-state analytical layers are stitched into one coherent bundle. 8. [ct_monograph_report.md](./ct_monograph_report.md) Why last: this is the publishing layer. Read it last because it is about presentation and assembly, not fact extraction. Suggested reading method: - Keep the Markdown preview open. - For each page, read the explanation on the right first. - Then look left at the code block and see how the explanation maps onto the exact lines. - Do not try to memorize every helper function on first pass. Focus on the few blocks that move real data from one stage to the next. - Pay special attention to the new `Flow arrows` panel on the right side. That panel tells you where the block's output goes next. What matters most: - In `ct_scan.py`: how raw database rows become verified leaf certificates. - In `ct_dns_utils.py`: how raw DNS answers become delivery clues. - In `ct_lineage_report.py`: how the code decides what is a normal renewal versus a red flag. - In `ct_caa_analysis.py`: how live DNS policy is compared with live certificate coverage. - In `ct_master_report.py`: how the current-state pieces are combined. What matters less on first read: - tiny formatting helpers - string-wrapping helpers - Markdown/LaTeX table plumbing Those are still useful, but they are support code, not the heart of the analytics. """ def block_span(node: ast.AST, next_node: ast.AST | None, total_lines: int) -> tuple[int, int]: start = min((item.lineno for item in getattr(node, "decorator_list", []) if hasattr(item, "lineno")), default=node.lineno) end = getattr(node, "end_lineno", None) or total_lines return start, end def fallback_explanation(file_name: str, block_name: str, kind: str) -> str: lower = block_name.lower() if kind == "class": return "This class is a structured container for one piece of data that later code passes around instead of juggling many loose variables." if lower == "parse_args": return "This block defines the command-line knobs for the file: input paths, cache settings, output paths, and other runtime switches." if lower == "main": return "This is the file's entrypoint. It glues the earlier helper blocks together into one end-to-end run." if lower.startswith("load_"): return "This block loads data from disk, cache, or an earlier stage so later code can work with it." if lower.startswith("store_"): return "This block saves an intermediate result so the next run can reuse it instead of recomputing everything." if lower.startswith("query_"): return "This block asks an external source for data and returns it in a shape the rest of the file can use." if lower.startswith("extract_"): return "This block pulls one specific piece of information out of a larger object." if lower.startswith("build_"): return "This block constructs a richer higher-level result from simpler inputs." if lower.startswith("render_"): return "This block turns structured analysis data into human-readable output." if lower.startswith("classify_"): return "This block applies rules and chooses a category label." if lower.startswith("summarize_") or lower == "summarize": return "This block compresses many detailed rows into a smaller, easier-to-read summary." if lower.startswith("compile_"): return "This block hands an intermediate artifact to an external tool so it becomes a finished output file." if lower.startswith("group_"): return "This block clusters related items together so later code can analyze them as families instead of as isolated rows." if lower.startswith("normalize_") or lower.startswith("canonicalize_"): return "This block makes values consistent so matching and grouping do not get confused by superficial differences." if lower.startswith("pct") or lower in {"utc_iso", "truncate_text", "first_list_item"}: return "This is a small helper that keeps the larger analytical code cleaner and easier to reuse." return f"This {kind} is one of the building blocks inside `{file_name}`. It exists so the file can do one narrow job at a time instead of one giant unreadable routine." def explain_block(file_name: str, block_name: str, kind: str) -> str: specific = BLOCK_NOTES.get(file_name, {}).get(block_name) if specific: return specific return fallback_explanation(file_name, block_name, kind) def code_panel(code: str, language: str = "python") -> str: escaped = html.escape(code.rstrip()) return ( '
'
        + escaped
        + "
" ) def explanation_panel(title: str, text: str) -> str: return ( f"

{html.escape(title)}

" f"

{html.escape(text)}

" ) def flow_panel(file_name: str, block_name: str) -> str: upstream, downstream = BLOCK_FLOWS.get(file_name, {}).get( block_name, ( "Earlier blocks or operator input feed this block.", "Later blocks in the same file or in the next analytical stage consume its output.", ), ) return ( "

Flow arrows

" f"

{html.escape(upstream)} → {html.escape(block_name)} → {html.escape(downstream)}

" ) def make_doc_for_file(file_name: str) -> str: path = ROOT / file_name source = path.read_text(encoding="utf-8") lines = source.splitlines() tree = ast.parse(source, filename=file_name) top_nodes = [node for node in tree.body if isinstance(node, (ast.ClassDef, ast.FunctionDef, ast.AsyncFunctionDef))] blocks: list[tuple[str, str, str]] = [] if top_nodes: first_start = min( (item.lineno for item in getattr(top_nodes[0], "decorator_list", []) if hasattr(item, "lineno")), default=top_nodes[0].lineno, ) preamble_end = first_start - 1 if preamble_end >= 1: preamble_code = "\n".join(lines[:preamble_end]).rstrip() if preamble_code: blocks.append(("Module setup", "module", preamble_code)) for index, node in enumerate(top_nodes): next_node = top_nodes[index + 1] if index + 1 < len(top_nodes) else None start, end = block_span(node, next_node, len(lines)) code = "\n".join(lines[start - 1 : end]).rstrip() kind = "class" if isinstance(node, ast.ClassDef) else "function" blocks.append((node.name, kind, code)) page_lines = [ f"# {file_name}", "", f"Source file: [`{file_name}`](../{file_name})", "", FILE_INTROS[file_name], "", f"Main flow in one line: `{FILE_FLOW_STRIPS[file_name]}`", "", "How to read this page:", "", "- left side: the actual source code block", "- right side: a plain-English explanation for a beginner", "- read from top to bottom because later blocks depend on earlier ones", "", ] for title, kind, code in blocks: explanation = explain_block(file_name, "__module__" if kind == "module" else title, kind) page_lines.extend( [ f"## {title}", "", '', "", '", '", "", "
', code_panel(code), "', explanation_panel("What this block is doing", explanation), flow_panel(file_name, title), explanation_panel( "How to think about it", "Treat this block as one small station in a pipeline. Ask: what comes in here, what gets changed here, and what comes out for the next block?", ), "
", "", ] ) return "\n".join(page_lines) + "\n" def main() -> int: OUT_DIR.mkdir(parents=True, exist_ok=True) for file_name in SOURCE_FILES: doc_path = OUT_DIR / file_name.replace(".py", ".md") doc_path.write_text(make_doc_for_file(file_name), encoding="utf-8") (OUT_DIR / "CURRICULUM.md").write_text(CURRICULUM, encoding="utf-8") return 0 if __name__ == "__main__": raise SystemExit(main())