Downgrade weak www signal in monograph

This commit is contained in:
saymrwulf 2026-03-30 13:51:35 +02:00
parent e65887c680
commit bd6c8688f8
3 changed files with 62 additions and 18 deletions

View file

@ -160,6 +160,11 @@ def dns_zone_count(hit: ct_scan.CertificateHit) -> int:
return len(zones)
def zone_root_label(name: str) -> str:
    """Return the lower-cased first label of the zone portion of *name*.

    The zone portion is element ``[1]`` of ``ct_scan.san_tail_split(name)``.
    The text before that zone's first dot (or the whole zone when it has
    no dot) is returned in lower case.
    """
    # NOTE(review): presumably san_tail_split yields (prefix, zone) -- confirm
    # against its definition in ct_scan.
    zone_part = ct_scan.san_tail_split(name)[1]
    root_label, _, _ = zone_part.partition(".")
    return root_label.lower()
def group_member_hits(groups: list[ct_scan.CertificateGroup], hits: list[ct_scan.CertificateHit]) -> dict[str, list[ct_scan.CertificateHit]]:
mapping: dict[str, list[ct_scan.CertificateHit]] = {}
for group in groups:
@ -266,18 +271,54 @@ def pick_examples(
)
)
www_hits = [hit for hit in hits if is_www_pair(hit)]
if www_hits:
hit = min(www_hits, key=lambda item: (item.subject_cn.count("."), item.subject_cn))
zone_tokens = sorted(
{
zone_root_label(hit.subject_cn)
for hit in hits
if "." in hit.subject_cn
}
| {
zone_root_label(entry[4:])
for hit in hits
for entry in hit.san_entries
if entry.startswith("DNS:")
}
)
splice_hits = []
for hit in hits:
if "." not in hit.subject_cn:
continue
leading_label = hit.subject_cn.split(".")[0].lower()
public_zone = ct_scan.san_tail_split(hit.subject_cn)[1]
public_zone_root = public_zone.split(".")[0].lower()
foreign_tokens = [token for token in zone_tokens if token != public_zone_root and token in leading_label]
if foreign_tokens:
splice_hits.append((hit, public_zone, foreign_tokens))
if splice_hits:
hit, public_zone, foreign_tokens = max(
splice_hits,
key=lambda item: (dns_zone_count(item[0]), len(item[0].san_entries), item[0].subject_cn),
)
middle_segment = hit.subject_cn.split(".")[1] if hit.subject_cn.count(".") >= 2 else ""
related = sorted(
{
other.subject_cn
for other in hits
if middle_segment and f".{middle_segment}." in other.subject_cn
and other.subject_cn != hit.subject_cn
and ct_scan.san_tail_split(other.subject_cn)[1] == public_zone
}
)
examples.append(
ExampleBlock(
title="Clean public front door",
title="Brand-platform splice",
subject_cn=hit.subject_cn,
why_it_matters="A two-name SAN pairing of the apex hostname with its www form is usually a deliberate customer-facing presentation rule rather than an internal platform rail.",
why_it_matters="When the left side of a hostname carries one business or platform label but the public zone belongs to another brand, that usually exposes migration residue or a shared platform being presented through a different public namespace.",
evidence=[
f"SAN entries: {', '.join(entry[4:] for entry in hit.san_entries if entry.startswith('DNS:'))}.",
f"Issuer: {sorted(hit.issuer_names)[0]}.",
f"Revocation status: {hit.revocation_status}.",
f"Subject CN mixes leading-label namespace tokens {', '.join(foreign_tokens[:3])} with the public zone {public_zone}: {hit.subject_cn}.",
f"Distinct DNS zones in SAN set: {dns_zone_count(hit)}.",
f"Representative sibling names in the same middle namespace: {', '.join(related[:5]) or 'none'}.",
f"SAN entries: {len(hit.san_entries)}.",
],
)
)

View file

@ -232,7 +232,7 @@ def example_pattern_label(title: str) -> str:
return {
"Shared operational rail": "Numbered fleet or operational-rail naming",
"Environment matrix certificate": "Environment-matrix and lifecycle naming",
"Clean public front door": "Public brand-entry naming",
"Brand-platform splice": "Cross-brand namespace and migration-residue naming",
"Cross-zone bridge": "Cross-zone bridge or shared-service naming",
}.get(title, "Naming pattern")
@ -617,9 +617,9 @@ def render_markdown(
lines.extend(
[
f"- Numbered CN families: {len(report['numbered_groups'])}.",
f"- Clean base-name plus `www` pairings: {report['public_www_pair_count']}.",
f"- Multi-zone SAN sets: {report['multi_zone_hit_count']}.",
f"- Frequent naming tokens: {', '.join(f'{token} ({count})' for token, count in report['top_env_tokens'][:8])}.",
"- The strongest naming signals come from numbered rails, environment markers, cross-brand labels, and cross-zone SAN composition. `www` is weak evidence either way.",
]
)
lines.append("")
@ -631,7 +631,7 @@ def render_markdown(
[
"- In most of these names, the left-most label tells you the endpoint role, node slot, or environment slice, while the zone on the right tells you which public namespace the service is answering under.",
"- Standard delivery shorthand appears throughout the corpus: `dev`, `qa`, `uat`, `sit`, `stg`, `preprod`, and `prod` are ordinary environment markers rather than mysterious product names.",
"- `www` usually means a public web presentation rule, not a platform rail.",
"- `www` is a weak signal both when present and when absent. Its presence often reflects compatibility, redirect history, or old web conventions; its absence does not imply any deeper architectural distinction.",
"- In this corpus, `nwg` reads as NatWest Group shorthand. Names like `rbs`, `natwest`, `ulsterbank`, `lombard`, `natwestpayments`, `coutts`, and `nwgwealth` are best read as parallel business or service namespaces within a wider shared estate, not as random unrelated domains.",
"- Some short forms remain inferential rather than provable. For example, `nft` clearly behaves like a non-production stage label, but Certificate Transparency alone cannot prove the local expansion used inside the company.",
]
@ -652,7 +652,7 @@ def render_markdown(
lines.append("")
lines.append("### Why These Four Examples")
lines.append("")
lines.append("Taken together, these four examples explain most of the naming behaviour in the corpus. The first shows platform fleet naming, the second shows environment-and-release naming, the third shows customer-facing brand presentation, and the fourth shows shared-service or migration bridging across several business namespaces.")
lines.append("Taken together, these four examples explain most of the naming behaviour in the corpus. The first shows platform fleet naming, the second shows environment-and-release naming, the third shows cross-brand namespace splicing and migration residue, and the fourth shows shared-service bridging across several business namespaces.")
lines.append("")
lines.append("## Chapter 6: DNS Delivery Architecture")
lines.append("")
@ -729,7 +729,7 @@ def render_markdown(
]
)
lines.append("")
lines.append("A useful way to read the corpus is to separate signal from noise. Repeated naming schemas are signal. Repeated DNS landing stacks are signal. Public trust lineage is signal. A one-off unusual label is usually noise unless it recurs across several certificates or lands on a distinctive platform.")
lines.append("A useful way to read the corpus is to separate signal from noise. Repeated naming schemas are signal. Repeated DNS landing patterns are signal. Public trust lineage is signal. Simple `www` presence or absence is weak evidence either way unless it coincides with stronger differences such as distinct DNS routing, distinct SAN composition, or a distinct certificate lineage.")
lines.append("")
lines.append("## Appendix A: Full Family Catalogue")
lines.append("")
@ -1319,9 +1319,9 @@ def render_latex(
add_summary(
[
f"Numbered CN families: {len(report['numbered_groups'])}.",
f"Clean base-name plus www pairings: {report['public_www_pair_count']}.",
f"Multi-zone SAN sets: {report['multi_zone_hit_count']}.",
f"Frequent naming tokens are {', '.join(f'{token} ({count})' for token, count in report['top_env_tokens'][:8])}.",
"The strongest naming signals come from numbered rails, environment markers, cross-brand labels, and cross-zone SAN composition. www is weak evidence either way.",
]
)
lines.append(
@ -1333,7 +1333,7 @@ def render_latex(
r"\begin{itemize}[leftmargin=1.4em]",
r"\item In most of these names, the left-most label tells you the endpoint role, node slot, or environment slice, while the zone on the right tells you which public namespace the service is answering under.",
r"\item Standard delivery shorthand appears throughout the corpus: \texttt{dev}, \texttt{qa}, \texttt{uat}, \texttt{sit}, \texttt{stg}, \texttt{preprod}, and \texttt{prod} are ordinary environment markers rather than mysterious product names.",
r"\item \texttt{www} usually means a public web presentation rule, not a platform rail.",
r"\item \texttt{www} is a weak signal both when present and when absent. Its presence often reflects compatibility, redirect history, or old web conventions; its absence does not imply any deeper architectural distinction.",
r"\item In this corpus, \texttt{nwg} reads as NatWest Group shorthand. Names like \texttt{rbs}, \texttt{natwest}, \texttt{ulsterbank}, \texttt{lombard}, \texttt{natwestpayments}, \texttt{coutts}, and \texttt{nwgwealth} are best read as parallel business or service namespaces within a wider shared estate, not as random unrelated domains.",
r"\item Some short forms remain inferential rather than provable. For example, \texttt{nft} clearly behaves like a non-production stage label, but Certificate Transparency alone cannot prove the local expansion used inside the company.",
r"\end{itemize}",
@ -1356,7 +1356,7 @@ def render_latex(
lines.extend(
[
r"\subsection{Why These Four Examples}",
r"Taken together, these four examples explain most of the naming behaviour in the corpus. The first shows platform fleet naming, the second shows environment-and-release naming, the third shows customer-facing brand presentation, and the fourth shows shared-service or migration bridging across several business namespaces.",
r"Taken together, these four examples explain most of the naming behaviour in the corpus. The first shows platform fleet naming, the second shows environment-and-release naming, the third shows cross-brand namespace splicing and migration residue, and the fourth shows shared-service bridging across several business namespaces.",
]
)
@ -1439,6 +1439,9 @@ def render_latex(
"A public NXDOMAIN today does not automatically contradict a valid certificate because DNS and certificate lifecycles move on different clocks.",
]
)
lines.append(
r"A useful way to read the corpus is to separate signal from noise. Repeated naming schemas are signal. Repeated DNS landing patterns are signal. Public trust lineage is signal. Simple \texttt{www} presence or absence is weak evidence either way unless it coincides with stronger differences such as distinct DNS routing, distinct SAN composition, or a distinct certificate lineage."
)
lines.extend(
[

View file

@ -678,7 +678,7 @@ def describe_group_basis(group: CertificateGroup) -> str:
pattern = next(iter(group.numbered_cn_patterns))
return f"CN pattern with running-number slot: `{pattern}`"
base = min(canonicalize_subject_cn(value) for value in group.subject_cns)
return f"Same endpoint CN family (exact CN, with `www.` folded): `{base}`"
return f"Same endpoint CN family (exact CN; `www.` grouped with base name): `{base}`"
def primary_issuer_name(hit: CertificateHit) -> str:
@ -886,7 +886,7 @@ def render_markdown_report(
lines.append("")
lines.append("- Chapters are built from Subject CN construction only.")
lines.append("- If multiple concrete CNs share the same numbered schema, they are grouped together.")
lines.append("- Otherwise the chapter is one endpoint family, with `www.` folded into the same base endpoint.")
lines.append("- Otherwise the chapter is one endpoint family; `www.` is grouped with the base name as a low-signal convenience.")
lines.append("- SAN entries are shown only inside each Subject CN subsection.")
lines.append("- All certificates shown here are verified leaf certificates.")
lines.append("")