mirror of
https://github.com/saymrwulf/CertTransparencySearch.git
synced 2026-05-14 20:37:52 +00:00
Downgrade weak www signal in monograph
This commit is contained in:
parent
e65887c680
commit
bd6c8688f8
3 changed files with 62 additions and 18 deletions
|
|
@ -160,6 +160,11 @@ def dns_zone_count(hit: ct_scan.CertificateHit) -> int:
|
|||
return len(zones)
|
||||
|
||||
|
||||
def zone_root_label(name: str) -> str:
    """Return the lower-cased leading label of the zone portion of *name*.

    The zone is the second element returned by ``ct_scan.san_tail_split``;
    only the text before the zone's first dot is kept.
    """
    zone_part = ct_scan.san_tail_split(name)[1]
    # partition() yields the text before the first dot (or the whole string
    # when there is no dot), the same value split(".")[0] would produce.
    root_label, _, _ = zone_part.partition(".")
    return root_label.lower()
|
||||
|
||||
|
||||
def group_member_hits(groups: list[ct_scan.CertificateGroup], hits: list[ct_scan.CertificateHit]) -> dict[str, list[ct_scan.CertificateHit]]:
|
||||
mapping: dict[str, list[ct_scan.CertificateHit]] = {}
|
||||
for group in groups:
|
||||
|
|
@ -266,18 +271,54 @@ def pick_examples(
|
|||
)
|
||||
)
|
||||
|
||||
www_hits = [hit for hit in hits if is_www_pair(hit)]
|
||||
if www_hits:
|
||||
hit = min(www_hits, key=lambda item: (item.subject_cn.count("."), item.subject_cn))
|
||||
zone_tokens = sorted(
|
||||
{
|
||||
zone_root_label(hit.subject_cn)
|
||||
for hit in hits
|
||||
if "." in hit.subject_cn
|
||||
}
|
||||
| {
|
||||
zone_root_label(entry[4:])
|
||||
for hit in hits
|
||||
for entry in hit.san_entries
|
||||
if entry.startswith("DNS:")
|
||||
}
|
||||
)
|
||||
splice_hits = []
|
||||
for hit in hits:
|
||||
if "." not in hit.subject_cn:
|
||||
continue
|
||||
leading_label = hit.subject_cn.split(".")[0].lower()
|
||||
public_zone = ct_scan.san_tail_split(hit.subject_cn)[1]
|
||||
public_zone_root = public_zone.split(".")[0].lower()
|
||||
foreign_tokens = [token for token in zone_tokens if token != public_zone_root and token in leading_label]
|
||||
if foreign_tokens:
|
||||
splice_hits.append((hit, public_zone, foreign_tokens))
|
||||
if splice_hits:
|
||||
hit, public_zone, foreign_tokens = max(
|
||||
splice_hits,
|
||||
key=lambda item: (dns_zone_count(item[0]), len(item[0].san_entries), item[0].subject_cn),
|
||||
)
|
||||
middle_segment = hit.subject_cn.split(".")[1] if hit.subject_cn.count(".") >= 2 else ""
|
||||
related = sorted(
|
||||
{
|
||||
other.subject_cn
|
||||
for other in hits
|
||||
if middle_segment and f".{middle_segment}." in other.subject_cn
|
||||
and other.subject_cn != hit.subject_cn
|
||||
and ct_scan.san_tail_split(other.subject_cn)[1] == public_zone
|
||||
}
|
||||
)
|
||||
examples.append(
|
||||
ExampleBlock(
|
||||
title="Clean public front door",
|
||||
title="Brand-platform splice",
|
||||
subject_cn=hit.subject_cn,
|
||||
why_it_matters="A two-name SAN pairing of the apex hostname with its www form is usually a deliberate customer-facing presentation rule rather than an internal platform rail.",
|
||||
why_it_matters="When the left side of a hostname carries one business or platform label but the public zone belongs to another brand, that usually exposes migration residue or a shared platform being presented through a different public namespace.",
|
||||
evidence=[
|
||||
f"SAN entries: {', '.join(entry[4:] for entry in hit.san_entries if entry.startswith('DNS:'))}.",
|
||||
f"Issuer: {sorted(hit.issuer_names)[0]}.",
|
||||
f"Revocation status: {hit.revocation_status}.",
|
||||
f"Subject CN mixes leading-label namespace tokens {', '.join(foreign_tokens[:3])} with the public zone {public_zone}: {hit.subject_cn}.",
|
||||
f"Distinct DNS zones in SAN set: {dns_zone_count(hit)}.",
|
||||
f"Representative sibling names in the same middle namespace: {', '.join(related[:5]) or 'none'}.",
|
||||
f"SAN entries: {len(hit.san_entries)}.",
|
||||
],
|
||||
)
|
||||
)
|
||||
|
|
|
|||
|
|
@ -232,7 +232,7 @@ def example_pattern_label(title: str) -> str:
|
|||
return {
|
||||
"Shared operational rail": "Numbered fleet or operational-rail naming",
|
||||
"Environment matrix certificate": "Environment-matrix and lifecycle naming",
|
||||
"Clean public front door": "Public brand-entry naming",
|
||||
"Brand-platform splice": "Cross-brand namespace and migration-residue naming",
|
||||
"Cross-zone bridge": "Cross-zone bridge or shared-service naming",
|
||||
}.get(title, "Naming pattern")
|
||||
|
||||
|
|
@ -617,9 +617,9 @@ def render_markdown(
|
|||
lines.extend(
|
||||
[
|
||||
f"- Numbered CN families: {len(report['numbered_groups'])}.",
|
||||
f"- Clean base-name plus `www` pairings: {report['public_www_pair_count']}.",
|
||||
f"- Multi-zone SAN sets: {report['multi_zone_hit_count']}.",
|
||||
f"- Frequent naming tokens: {', '.join(f'{token} ({count})' for token, count in report['top_env_tokens'][:8])}.",
|
||||
"- The strongest naming signals come from numbered rails, environment markers, cross-brand labels, and cross-zone SAN composition. `www` is weak evidence either way.",
|
||||
]
|
||||
)
|
||||
lines.append("")
|
||||
|
|
@ -631,7 +631,7 @@ def render_markdown(
|
|||
[
|
||||
"- In most of these names, the left-most label tells you the endpoint role, node slot, or environment slice, while the zone on the right tells you which public namespace the service is answering under.",
|
||||
"- Standard delivery shorthand appears throughout the corpus: `dev`, `qa`, `uat`, `sit`, `stg`, `preprod`, and `prod` are ordinary environment markers rather than mysterious product names.",
|
||||
"- `www` usually means a public web presentation rule, not a platform rail.",
|
||||
"- `www` is a weak signal both when present and when absent. Its presence often reflects compatibility, redirect history, or old web conventions; its absence does not imply any deeper architectural distinction.",
|
||||
"- In this corpus, `nwg` reads as NatWest Group shorthand. Names like `rbs`, `natwest`, `ulsterbank`, `lombard`, `natwestpayments`, `coutts`, and `nwgwealth` are best read as parallel business or service namespaces within a wider shared estate, not as random unrelated domains.",
|
||||
"- Some short forms remain inferential rather than provable. For example, `nft` clearly behaves like a non-production stage label, but Certificate Transparency alone cannot prove the local expansion used inside the company.",
|
||||
]
|
||||
|
|
@ -652,7 +652,7 @@ def render_markdown(
|
|||
lines.append("")
|
||||
lines.append("### Why These Four Examples")
|
||||
lines.append("")
|
||||
lines.append("Taken together, these four examples explain most of the naming behaviour in the corpus. The first shows platform fleet naming, the second shows environment-and-release naming, the third shows customer-facing brand presentation, and the fourth shows shared-service or migration bridging across several business namespaces.")
|
||||
lines.append("Taken together, these four examples explain most of the naming behaviour in the corpus. The first shows platform fleet naming, the second shows environment-and-release naming, the third shows cross-brand namespace splicing and migration residue, and the fourth shows shared-service bridging across several business namespaces.")
|
||||
lines.append("")
|
||||
lines.append("## Chapter 6: DNS Delivery Architecture")
|
||||
lines.append("")
|
||||
|
|
@ -729,7 +729,7 @@ def render_markdown(
|
|||
]
|
||||
)
|
||||
lines.append("")
|
||||
lines.append("A useful way to read the corpus is to separate signal from noise. Repeated naming schemas are signal. Repeated DNS landing stacks are signal. Public trust lineage is signal. A one-off unusual label is usually noise unless it recurs across several certificates or lands on a distinctive platform.")
|
||||
lines.append("A useful way to read the corpus is to separate signal from noise. Repeated naming schemas are signal. Repeated DNS landing patterns are signal. Public trust lineage is signal. Simple `www` presence or absence is weak evidence either way unless it coincides with stronger differences such as distinct DNS routing, distinct SAN composition, or a distinct certificate lineage.")
|
||||
lines.append("")
|
||||
lines.append("## Appendix A: Full Family Catalogue")
|
||||
lines.append("")
|
||||
|
|
@ -1319,9 +1319,9 @@ def render_latex(
|
|||
add_summary(
|
||||
[
|
||||
f"Numbered CN families: {len(report['numbered_groups'])}.",
|
||||
f"Clean base-name plus www pairings: {report['public_www_pair_count']}.",
|
||||
f"Multi-zone SAN sets: {report['multi_zone_hit_count']}.",
|
||||
f"Frequent naming tokens are {', '.join(f'{token} ({count})' for token, count in report['top_env_tokens'][:8])}.",
|
||||
"The strongest naming signals come from numbered rails, environment markers, cross-brand labels, and cross-zone SAN composition. www is weak evidence either way.",
|
||||
]
|
||||
)
|
||||
lines.append(
|
||||
|
|
@ -1333,7 +1333,7 @@ def render_latex(
|
|||
r"\begin{itemize}[leftmargin=1.4em]",
|
||||
r"\item In most of these names, the left-most label tells you the endpoint role, node slot, or environment slice, while the zone on the right tells you which public namespace the service is answering under.",
|
||||
r"\item Standard delivery shorthand appears throughout the corpus: \texttt{dev}, \texttt{qa}, \texttt{uat}, \texttt{sit}, \texttt{stg}, \texttt{preprod}, and \texttt{prod} are ordinary environment markers rather than mysterious product names.",
|
||||
r"\item \texttt{www} usually means a public web presentation rule, not a platform rail.",
|
||||
r"\item \texttt{www} is a weak signal both when present and when absent. Its presence often reflects compatibility, redirect history, or old web conventions; its absence does not imply any deeper architectural distinction.",
|
||||
r"\item In this corpus, \texttt{nwg} reads as NatWest Group shorthand. Names like \texttt{rbs}, \texttt{natwest}, \texttt{ulsterbank}, \texttt{lombard}, \texttt{natwestpayments}, \texttt{coutts}, and \texttt{nwgwealth} are best read as parallel business or service namespaces within a wider shared estate, not as random unrelated domains.",
|
||||
r"\item Some short forms remain inferential rather than provable. For example, \texttt{nft} clearly behaves like a non-production stage label, but Certificate Transparency alone cannot prove the local expansion used inside the company.",
|
||||
r"\end{itemize}",
|
||||
|
|
@ -1356,7 +1356,7 @@ def render_latex(
|
|||
lines.extend(
|
||||
[
|
||||
r"\subsection{Why These Four Examples}",
|
||||
r"Taken together, these four examples explain most of the naming behaviour in the corpus. The first shows platform fleet naming, the second shows environment-and-release naming, the third shows customer-facing brand presentation, and the fourth shows shared-service or migration bridging across several business namespaces.",
|
||||
r"Taken together, these four examples explain most of the naming behaviour in the corpus. The first shows platform fleet naming, the second shows environment-and-release naming, the third shows cross-brand namespace splicing and migration residue, and the fourth shows shared-service bridging across several business namespaces.",
|
||||
]
|
||||
)
|
||||
|
||||
|
|
@ -1439,6 +1439,9 @@ def render_latex(
|
|||
"A public NXDOMAIN today does not automatically contradict a valid certificate because DNS and certificate lifecycles move on different clocks.",
|
||||
]
|
||||
)
|
||||
lines.append(
|
||||
r"A useful way to read the corpus is to separate signal from noise. Repeated naming schemas are signal. Repeated DNS landing patterns are signal. Public trust lineage is signal. Simple \texttt{www} presence or absence is weak evidence either way unless it coincides with stronger differences such as distinct DNS routing, distinct SAN composition, or a distinct certificate lineage."
|
||||
)
|
||||
|
||||
lines.extend(
|
||||
[
|
||||
|
|
|
|||
|
|
@ -678,7 +678,7 @@ def describe_group_basis(group: CertificateGroup) -> str:
|
|||
pattern = next(iter(group.numbered_cn_patterns))
|
||||
return f"CN pattern with running-number slot: `{pattern}`"
|
||||
base = min(canonicalize_subject_cn(value) for value in group.subject_cns)
|
||||
return f"Same endpoint CN family (exact CN, with `www.` folded): `{base}`"
|
||||
return f"Same endpoint CN family (exact CN; `www.` grouped with base name): `{base}`"
|
||||
|
||||
|
||||
def primary_issuer_name(hit: CertificateHit) -> str:
|
||||
|
|
@ -886,7 +886,7 @@ def render_markdown_report(
|
|||
lines.append("")
|
||||
lines.append("- Chapters are built from Subject CN construction only.")
|
||||
lines.append("- If multiple concrete CNs share the same numbered schema, they are grouped together.")
|
||||
lines.append("- Otherwise the chapter is one endpoint family, with `www.` folded into the same base endpoint.")
|
||||
lines.append("- Otherwise the chapter is one endpoint family; `www.` is grouped with the base name as a low-signal convenience.")
|
||||
lines.append("- SAN entries are shown only inside each Subject CN subsection.")
|
||||
lines.append("- All certificates shown here are verified leaf certificates.")
|
||||
lines.append("")
|
||||
|
|
|
|||
Loading…
Reference in a new issue