feat: deterministic reCAPTCHA cookie pre-seed via Bayesian browsing history
Adds opt-in helper that auto-injects coherent cookie history into every
BrowserContext created via new_context(). Content is fully deterministic
from the persona seed so a given seed always presents the same cookies
across sessions.
Composition (per persona, all derived from seed):
- 5 cookies on .google.com (NID, CONSENT, SOCS, _GRECAPTCHA, ENID).
Excludes 1P_JAR which was deprecated by Google in 2022. CONSENT
`lang+region` token derived from the persona's IANA timezone
(Europe/Rome -> it+IT, America/* -> en+FX, etc.). NID prefix
broadened to 100-540 to cover historical versions.
- Per-site cookies on 13-25 "visited" everyday domains, sampled from a
Bayesian network conditioned on gpu_class - workstation/high_end
personas trend toward dev/tech sites, low_end/integrated_old trend
toward shop/news/reference. Each site contributes 1-7 cookies based
on a `cookie_profile` tag. Cookie pool includes _ga, _gid, _clck,
_clsk, __cf_bm, OneTrust/CookieYes consent, _fbp (Facebook Pixel),
_dc_gtm_<id> (Tag Manager helper), __hssrc (HubSpot helper).
API:
Stealthfox(seed=42, prep_recaptcha=True)
No per-call configuration: visited-sites + cookie composition all derived
from the persona seed via the Bayesian sampler.
Gated server-side: forced False if profile_dir is set (persistent profile
owns its own state). All expiries capped to 395 days per Chrome/Firefox
400-day RFC 6265bis-15 limit.
Bayesian integration:
- New `derive_browsing_history(gpu_class, rng)` in _fpforge/_sampler.py
(parallel to `derive_font_prefs`).
- New data files: browsing_pool.json (50 site entries) and
cpt_browsing_given_class.json (per-class probabilities).
- Profile dataclass exposes `browsing_history` field.
- _recaptcha_seed.py consumes Profile.browsing_history; receives
timezone separately to derive CONSENT lang+region.
Also drops a dead Chromium-only e2e test that always skipped on our
Firefox-only wrapper.
Test coverage: 29 unit tests covering composition, profile recipes
(minimal/ga_only/ga_cf/ga_consent/ga_consent_clarity), determinism,
Chrome 400-day cap, Playwright field requirements, CONSENT lang
mapping (IT/DE/US/default), helper-cookie probability distributions,
end-to-end with real fpforge Profile.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -84,6 +84,12 @@ _FONT_POOL = _load("font_pool.json")
|
|||||||
_FONT_CORE: list = _FONT_POOL["core"]
|
_FONT_CORE: list = _FONT_POOL["core"]
|
||||||
_FONT_OPTIONAL: list = _FONT_POOL["optional"]
|
_FONT_OPTIONAL: list = _FONT_POOL["optional"]
|
||||||
_CPT_FONTS_OPT = _load("cpt_fonts_optional_given_class.json")["table"]
|
_CPT_FONTS_OPT = _load("cpt_fonts_optional_given_class.json")["table"]
|
||||||
|
# Browsing-history pool + CPT (per-class probabilities for visited sites).
# Drives _recaptcha_seed's cookie pre-seed: each persona ends up with a
# coherent list of ~15-30 visited sites whose categories correlate with
# gpu_class (workstation → dev-heavy, integrated_old → shop+news-heavy).
#
# _BROWSING_POOL: the "entries" array of browsing_pool.json, i.e. a list of
#   {"name", "category", "cookie_profile"} dicts, one per candidate site.
# _CPT_BROWSING: the "table" object of cpt_browsing_given_class.json, i.e.
#   {gpu_class: {site name: P(visited | gpu_class)}}.
_BROWSING_POOL: list = _load("browsing_pool.json")["entries"]
_CPT_BROWSING = _load("cpt_browsing_given_class.json")["table"]
|
||||||
|
|
||||||
|
|
||||||
# ═══════════════════════════════════════════════════════════════════════
|
# ═══════════════════════════════════════════════════════════════════════
|
||||||
@@ -282,6 +288,33 @@ def derive_font_whitelist(gpu_class: str, rng) -> str:
|
|||||||
return derive_font_prefs(gpu_class, rng)["whitelist"]
|
return derive_font_prefs(gpu_class, rng)["whitelist"]
|
||||||
|
|
||||||
|
|
||||||
|
# ═══════════════════════════════════════════════════════════════════════
|
||||||
|
# BROWSING HISTORY (Bayesian: per-site P(visited|gpu_class))
|
||||||
|
# ═══════════════════════════════════════════════════════════════════════
|
||||||
|
def derive_browsing_history(gpu_class: str, rng) -> list:
    """Pick the set of pool sites this persona counts as recently visited.

    Every entry of the browsing pool is an independent coin flip whose bias
    is the per-class probability from the CPT. A ``gpu_class`` without a CPT
    row uses the ``mid_range`` row; a site without a probability in the
    chosen row uses 0.3.

    The per-class probabilities are tuned so that roughly 15-30 sites are
    selected on average, which reads as an established user.

    ``rng`` is any object with a ``random()`` method; it is called exactly
    once per pool entry, in pool order, so the same rng state always gives
    the same history.

    Returns a list of shallow copies of the selected pool entries, e.g.
        [{"name": "github.com", "category": "dev", "cookie_profile": "ga_cf"}, ...]
    ordered by ``name`` so the output is stable for a given seed.
    """
    class_row = _CPT_BROWSING.get(gpu_class)
    if class_row is None:
        class_row = _CPT_BROWSING["mid_range"]

    # dict(site) copies the entry so callers can never mutate the shared pool.
    picked = [
        dict(site)
        for site in _BROWSING_POOL
        if class_row.get(site["name"], 0.3) > rng.random()
    ]
    return sorted(picked, key=lambda site: site["name"])
|
||||||
|
|
||||||
|
|
||||||
# ═══════════════════════════════════════════════════════════════════════
|
# ═══════════════════════════════════════════════════════════════════════
|
||||||
# PUBLIC API: Forge
|
# PUBLIC API: Forge
|
||||||
# ═══════════════════════════════════════════════════════════════════════
|
# ═══════════════════════════════════════════════════════════════════════
|
||||||
@@ -350,6 +383,12 @@ class Forge:
|
|||||||
bundle["gpu_class"], self._rng
|
bundle["gpu_class"], self._rng
|
||||||
).items()
|
).items()
|
||||||
},
|
},
|
||||||
|
# Bayesian browsing history (per-class P(visited|gpu_class)).
|
||||||
|
# Consumed by _recaptcha_seed.py to seed coherent cookie history
|
||||||
|
# when invisible_playwright is launched with prep_recaptcha=True.
|
||||||
|
"browsing_history": derive_browsing_history(
|
||||||
|
bundle["gpu_class"], self._rng
|
||||||
|
),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1,64 @@
|
|||||||
|
{
|
||||||
|
"_comment": [
|
||||||
|
"Pool of everyday websites used by the browsing_history node.",
|
||||||
|
"Each entry: { name, category, cookie_profile }.",
|
||||||
|
"- name: bare domain (no scheme, no leading dot).",
|
||||||
|
"- category: dev / shop / news / reference / media / community / misc.",
|
||||||
|
"- cookie_profile: short tag pointing to a cookie-template recipe used by",
|
||||||
|
" _recaptcha_seed.py to generate concrete cookies (so heavy-analytics sites",
|
||||||
|
" get _ga+_gid+OneTrust, simple sites get just _ga, dev tools get GH-style).",
|
||||||
|
"Add new entries here + add per-class probabilities in cpt_browsing_given_class.json."
|
||||||
|
],
|
||||||
|
"entries": [
|
||||||
|
{"name": "youtube.com", "category": "media", "cookie_profile": "ga_only"},
|
||||||
|
{"name": "wikipedia.org", "category": "reference", "cookie_profile": "minimal"},
|
||||||
|
{"name": "mozilla.org", "category": "reference", "cookie_profile": "ga_consent"},
|
||||||
|
{"name": "w3schools.com", "category": "dev", "cookie_profile": "ga_consent_clarity"},
|
||||||
|
{"name": "mdn.io", "category": "dev", "cookie_profile": "minimal"},
|
||||||
|
{"name": "duckduckgo.com", "category": "reference", "cookie_profile": "minimal"},
|
||||||
|
{"name": "github.com", "category": "dev", "cookie_profile": "ga_cf"},
|
||||||
|
{"name": "stackoverflow.com", "category": "dev", "cookie_profile": "ga_consent_clarity"},
|
||||||
|
{"name": "npmjs.com", "category": "dev", "cookie_profile": "ga_consent"},
|
||||||
|
{"name": "gitlab.com", "category": "dev", "cookie_profile": "ga_cf"},
|
||||||
|
{"name": "pypi.org", "category": "dev", "cookie_profile": "minimal"},
|
||||||
|
{"name": "docs.python.org", "category": "dev", "cookie_profile": "minimal"},
|
||||||
|
{"name": "rust-lang.org", "category": "dev", "cookie_profile": "ga_consent"},
|
||||||
|
{"name": "go.dev", "category": "dev", "cookie_profile": "ga_consent"},
|
||||||
|
{"name": "amazon.com", "category": "shop", "cookie_profile": "ga_consent_clarity"},
|
||||||
|
{"name": "ebay.com", "category": "shop", "cookie_profile": "ga_consent"},
|
||||||
|
{"name": "etsy.com", "category": "shop", "cookie_profile": "ga_consent_clarity"},
|
||||||
|
{"name": "bestbuy.com", "category": "shop", "cookie_profile": "ga_consent_clarity"},
|
||||||
|
{"name": "target.com", "category": "shop", "cookie_profile": "ga_consent_clarity"},
|
||||||
|
{"name": "nytimes.com", "category": "news", "cookie_profile": "ga_consent_clarity"},
|
||||||
|
{"name": "cnn.com", "category": "news", "cookie_profile": "ga_consent"},
|
||||||
|
{"name": "bbc.com", "category": "news", "cookie_profile": "ga_consent"},
|
||||||
|
{"name": "theguardian.com", "category": "news", "cookie_profile": "ga_consent_clarity"},
|
||||||
|
{"name": "reuters.com", "category": "news", "cookie_profile": "ga_consent"},
|
||||||
|
{"name": "apnews.com", "category": "news", "cookie_profile": "ga_consent"},
|
||||||
|
{"name": "washingtonpost.com", "category": "news", "cookie_profile": "ga_consent"},
|
||||||
|
{"name": "techcrunch.com", "category": "news", "cookie_profile": "ga_consent_clarity"},
|
||||||
|
{"name": "theverge.com", "category": "news", "cookie_profile": "ga_consent"},
|
||||||
|
{"name": "arstechnica.com", "category": "news", "cookie_profile": "ga_consent"},
|
||||||
|
{"name": "wired.com", "category": "news", "cookie_profile": "ga_consent_clarity"},
|
||||||
|
{"name": "engadget.com", "category": "news", "cookie_profile": "ga_consent"},
|
||||||
|
{"name": "9to5mac.com", "category": "news", "cookie_profile": "ga_consent"},
|
||||||
|
{"name": "medium.com", "category": "community", "cookie_profile": "ga_consent"},
|
||||||
|
{"name": "dev.to", "category": "community", "cookie_profile": "ga_consent"},
|
||||||
|
{"name": "reddit.com", "category": "community", "cookie_profile": "ga_cf"},
|
||||||
|
{"name": "news.ycombinator.com", "category": "community", "cookie_profile": "minimal"},
|
||||||
|
{"name": "quora.com", "category": "community", "cookie_profile": "ga_consent_clarity"},
|
||||||
|
{"name": "stackexchange.com", "category": "community", "cookie_profile": "ga_consent_clarity"},
|
||||||
|
{"name": "imdb.com", "category": "media", "cookie_profile": "ga_consent_clarity"},
|
||||||
|
{"name": "rottentomatoes.com", "category": "media", "cookie_profile": "ga_consent"},
|
||||||
|
{"name": "metacritic.com", "category": "media", "cookie_profile": "ga_consent"},
|
||||||
|
{"name": "allrecipes.com", "category": "misc", "cookie_profile": "ga_consent_clarity"},
|
||||||
|
{"name": "epicurious.com", "category": "misc", "cookie_profile": "ga_consent"},
|
||||||
|
{"name": "tripadvisor.com", "category": "misc", "cookie_profile": "ga_consent_clarity"},
|
||||||
|
{"name": "weather.com", "category": "reference", "cookie_profile": "ga_consent"},
|
||||||
|
{"name": "timeanddate.com", "category": "reference", "cookie_profile": "ga_consent"},
|
||||||
|
{"name": "thesaurus.com", "category": "reference", "cookie_profile": "ga_consent_clarity"},
|
||||||
|
{"name": "kayak.com", "category": "shop", "cookie_profile": "ga_consent_clarity"},
|
||||||
|
{"name": "booking.com", "category": "shop", "cookie_profile": "ga_consent_clarity"},
|
||||||
|
{"name": "airbnb.com", "category": "shop", "cookie_profile": "ga_consent"}
|
||||||
|
]
|
||||||
|
}
|
||||||
@@ -0,0 +1,138 @@
|
|||||||
|
{
|
||||||
|
"_comment": [
|
||||||
|
"Per-class probability that a persona of a given gpu_class has visited each",
|
||||||
|
"site in the pool. Used by the browsing_history node to derive a coherent",
|
||||||
|
"visited-domain list per persona.",
|
||||||
|
"",
|
||||||
|
"Probabilities are tuned so each class samples ~15-30 sites on average",
|
||||||
|
"(sum across all 50 entries falls in that range), giving an established-user",
|
||||||
|
"look. Categories are biased by class:",
|
||||||
|
" - workstation/high_end: higher P(dev) + high P(news/media)",
|
||||||
|
" - mid_range: balanced",
|
||||||
|
" - low_end/integrated_*: lower P(dev), higher P(shop/news/reference)",
|
||||||
|
"",
|
||||||
|
"Missing class falls back to mid_range via Node CPT pool fallback."
|
||||||
|
],
|
||||||
|
"table": {
|
||||||
|
"workstation": {
|
||||||
|
"youtube.com": 0.80, "wikipedia.org": 0.85, "mozilla.org": 0.70,
|
||||||
|
"w3schools.com": 0.40, "mdn.io": 0.55, "duckduckgo.com": 0.45,
|
||||||
|
"github.com": 0.95, "stackoverflow.com": 0.90, "npmjs.com": 0.65,
|
||||||
|
"gitlab.com": 0.50, "pypi.org": 0.55, "docs.python.org": 0.60,
|
||||||
|
"rust-lang.org": 0.35, "go.dev": 0.30,
|
||||||
|
"amazon.com": 0.70, "ebay.com": 0.25, "etsy.com": 0.15,
|
||||||
|
"bestbuy.com": 0.45, "target.com": 0.30,
|
||||||
|
"nytimes.com": 0.55, "cnn.com": 0.40, "bbc.com": 0.55,
|
||||||
|
"theguardian.com": 0.45, "reuters.com": 0.40, "apnews.com": 0.30,
|
||||||
|
"washingtonpost.com": 0.40,
|
||||||
|
"techcrunch.com": 0.65, "theverge.com": 0.60, "arstechnica.com": 0.65,
|
||||||
|
"wired.com": 0.50, "engadget.com": 0.35, "9to5mac.com": 0.30,
|
||||||
|
"medium.com": 0.55, "dev.to": 0.40, "reddit.com": 0.70,
|
||||||
|
"news.ycombinator.com": 0.65, "quora.com": 0.20, "stackexchange.com": 0.60,
|
||||||
|
"imdb.com": 0.45, "rottentomatoes.com": 0.25, "metacritic.com": 0.20,
|
||||||
|
"allrecipes.com": 0.20, "epicurious.com": 0.15, "tripadvisor.com": 0.30,
|
||||||
|
"weather.com": 0.55, "timeanddate.com": 0.30, "thesaurus.com": 0.25,
|
||||||
|
"kayak.com": 0.30, "booking.com": 0.35, "airbnb.com": 0.30
|
||||||
|
},
|
||||||
|
"high_end": {
|
||||||
|
"youtube.com": 0.85, "wikipedia.org": 0.80, "mozilla.org": 0.60,
|
||||||
|
"w3schools.com": 0.45, "mdn.io": 0.45, "duckduckgo.com": 0.40,
|
||||||
|
"github.com": 0.85, "stackoverflow.com": 0.80, "npmjs.com": 0.50,
|
||||||
|
"gitlab.com": 0.40, "pypi.org": 0.45, "docs.python.org": 0.50,
|
||||||
|
"rust-lang.org": 0.30, "go.dev": 0.25,
|
||||||
|
"amazon.com": 0.75, "ebay.com": 0.30, "etsy.com": 0.20,
|
||||||
|
"bestbuy.com": 0.50, "target.com": 0.35,
|
||||||
|
"nytimes.com": 0.50, "cnn.com": 0.50, "bbc.com": 0.50,
|
||||||
|
"theguardian.com": 0.40, "reuters.com": 0.35, "apnews.com": 0.30,
|
||||||
|
"washingtonpost.com": 0.35,
|
||||||
|
"techcrunch.com": 0.60, "theverge.com": 0.65, "arstechnica.com": 0.60,
|
||||||
|
"wired.com": 0.50, "engadget.com": 0.40, "9to5mac.com": 0.35,
|
||||||
|
"medium.com": 0.50, "dev.to": 0.35, "reddit.com": 0.75,
|
||||||
|
"news.ycombinator.com": 0.55, "quora.com": 0.25, "stackexchange.com": 0.55,
|
||||||
|
"imdb.com": 0.55, "rottentomatoes.com": 0.35, "metacritic.com": 0.30,
|
||||||
|
"allrecipes.com": 0.25, "epicurious.com": 0.20, "tripadvisor.com": 0.30,
|
||||||
|
"weather.com": 0.55, "timeanddate.com": 0.30, "thesaurus.com": 0.25,
|
||||||
|
"kayak.com": 0.30, "booking.com": 0.40, "airbnb.com": 0.30
|
||||||
|
},
|
||||||
|
"mid_range": {
|
||||||
|
"youtube.com": 0.85, "wikipedia.org": 0.75, "mozilla.org": 0.45,
|
||||||
|
"w3schools.com": 0.40, "mdn.io": 0.30, "duckduckgo.com": 0.35,
|
||||||
|
"github.com": 0.55, "stackoverflow.com": 0.55, "npmjs.com": 0.30,
|
||||||
|
"gitlab.com": 0.25, "pypi.org": 0.25, "docs.python.org": 0.30,
|
||||||
|
"rust-lang.org": 0.15, "go.dev": 0.15,
|
||||||
|
"amazon.com": 0.80, "ebay.com": 0.40, "etsy.com": 0.30,
|
||||||
|
"bestbuy.com": 0.55, "target.com": 0.40,
|
||||||
|
"nytimes.com": 0.45, "cnn.com": 0.55, "bbc.com": 0.45,
|
||||||
|
"theguardian.com": 0.35, "reuters.com": 0.30, "apnews.com": 0.30,
|
||||||
|
"washingtonpost.com": 0.30,
|
||||||
|
"techcrunch.com": 0.45, "theverge.com": 0.50, "arstechnica.com": 0.40,
|
||||||
|
"wired.com": 0.45, "engadget.com": 0.35, "9to5mac.com": 0.30,
|
||||||
|
"medium.com": 0.45, "dev.to": 0.25, "reddit.com": 0.70,
|
||||||
|
"news.ycombinator.com": 0.30, "quora.com": 0.35, "stackexchange.com": 0.40,
|
||||||
|
"imdb.com": 0.60, "rottentomatoes.com": 0.40, "metacritic.com": 0.35,
|
||||||
|
"allrecipes.com": 0.35, "epicurious.com": 0.25, "tripadvisor.com": 0.40,
|
||||||
|
"weather.com": 0.60, "timeanddate.com": 0.25, "thesaurus.com": 0.30,
|
||||||
|
"kayak.com": 0.35, "booking.com": 0.45, "airbnb.com": 0.40
|
||||||
|
},
|
||||||
|
"low_end": {
|
||||||
|
"youtube.com": 0.85, "wikipedia.org": 0.70, "mozilla.org": 0.35,
|
||||||
|
"w3schools.com": 0.30, "mdn.io": 0.20, "duckduckgo.com": 0.30,
|
||||||
|
"github.com": 0.30, "stackoverflow.com": 0.30, "npmjs.com": 0.15,
|
||||||
|
"gitlab.com": 0.10, "pypi.org": 0.10, "docs.python.org": 0.15,
|
||||||
|
"rust-lang.org": 0.05, "go.dev": 0.05,
|
||||||
|
"amazon.com": 0.85, "ebay.com": 0.50, "etsy.com": 0.40,
|
||||||
|
"bestbuy.com": 0.55, "target.com": 0.45,
|
||||||
|
"nytimes.com": 0.40, "cnn.com": 0.60, "bbc.com": 0.40,
|
||||||
|
"theguardian.com": 0.30, "reuters.com": 0.25, "apnews.com": 0.30,
|
||||||
|
"washingtonpost.com": 0.25,
|
||||||
|
"techcrunch.com": 0.30, "theverge.com": 0.35, "arstechnica.com": 0.25,
|
||||||
|
"wired.com": 0.40, "engadget.com": 0.30, "9to5mac.com": 0.25,
|
||||||
|
"medium.com": 0.35, "dev.to": 0.15, "reddit.com": 0.65,
|
||||||
|
"news.ycombinator.com": 0.15, "quora.com": 0.45, "stackexchange.com": 0.25,
|
||||||
|
"imdb.com": 0.65, "rottentomatoes.com": 0.45, "metacritic.com": 0.35,
|
||||||
|
"allrecipes.com": 0.45, "epicurious.com": 0.30, "tripadvisor.com": 0.45,
|
||||||
|
"weather.com": 0.65, "timeanddate.com": 0.25, "thesaurus.com": 0.35,
|
||||||
|
"kayak.com": 0.35, "booking.com": 0.50, "airbnb.com": 0.40
|
||||||
|
},
|
||||||
|
"integrated_modern": {
|
||||||
|
"youtube.com": 0.85, "wikipedia.org": 0.70, "mozilla.org": 0.40,
|
||||||
|
"w3schools.com": 0.35, "mdn.io": 0.25, "duckduckgo.com": 0.35,
|
||||||
|
"github.com": 0.40, "stackoverflow.com": 0.40, "npmjs.com": 0.20,
|
||||||
|
"gitlab.com": 0.15, "pypi.org": 0.20, "docs.python.org": 0.20,
|
||||||
|
"rust-lang.org": 0.10, "go.dev": 0.10,
|
||||||
|
"amazon.com": 0.80, "ebay.com": 0.40, "etsy.com": 0.30,
|
||||||
|
"bestbuy.com": 0.50, "target.com": 0.40,
|
||||||
|
"nytimes.com": 0.40, "cnn.com": 0.55, "bbc.com": 0.45,
|
||||||
|
"theguardian.com": 0.35, "reuters.com": 0.30, "apnews.com": 0.30,
|
||||||
|
"washingtonpost.com": 0.30,
|
||||||
|
"techcrunch.com": 0.40, "theverge.com": 0.45, "arstechnica.com": 0.30,
|
||||||
|
"wired.com": 0.40, "engadget.com": 0.30, "9to5mac.com": 0.25,
|
||||||
|
"medium.com": 0.40, "dev.to": 0.20, "reddit.com": 0.65,
|
||||||
|
"news.ycombinator.com": 0.25, "quora.com": 0.40, "stackexchange.com": 0.35,
|
||||||
|
"imdb.com": 0.60, "rottentomatoes.com": 0.40, "metacritic.com": 0.30,
|
||||||
|
"allrecipes.com": 0.40, "epicurious.com": 0.25, "tripadvisor.com": 0.40,
|
||||||
|
"weather.com": 0.60, "timeanddate.com": 0.25, "thesaurus.com": 0.30,
|
||||||
|
"kayak.com": 0.35, "booking.com": 0.45, "airbnb.com": 0.40
|
||||||
|
},
|
||||||
|
"integrated_old": {
|
||||||
|
"youtube.com": 0.75, "wikipedia.org": 0.65, "mozilla.org": 0.30,
|
||||||
|
"w3schools.com": 0.20, "mdn.io": 0.10, "duckduckgo.com": 0.25,
|
||||||
|
"github.com": 0.15, "stackoverflow.com": 0.20, "npmjs.com": 0.05,
|
||||||
|
"gitlab.com": 0.05, "pypi.org": 0.05, "docs.python.org": 0.10,
|
||||||
|
"rust-lang.org": 0.02, "go.dev": 0.02,
|
||||||
|
"amazon.com": 0.85, "ebay.com": 0.55, "etsy.com": 0.45,
|
||||||
|
"bestbuy.com": 0.55, "target.com": 0.50,
|
||||||
|
"nytimes.com": 0.45, "cnn.com": 0.65, "bbc.com": 0.40,
|
||||||
|
"theguardian.com": 0.30, "reuters.com": 0.25, "apnews.com": 0.35,
|
||||||
|
"washingtonpost.com": 0.30,
|
||||||
|
"techcrunch.com": 0.20, "theverge.com": 0.25, "arstechnica.com": 0.15,
|
||||||
|
"wired.com": 0.30, "engadget.com": 0.20, "9to5mac.com": 0.20,
|
||||||
|
"medium.com": 0.30, "dev.to": 0.05, "reddit.com": 0.55,
|
||||||
|
"news.ycombinator.com": 0.05, "quora.com": 0.55, "stackexchange.com": 0.15,
|
||||||
|
"imdb.com": 0.70, "rottentomatoes.com": 0.50, "metacritic.com": 0.35,
|
||||||
|
"allrecipes.com": 0.55, "epicurious.com": 0.35, "tripadvisor.com": 0.50,
|
||||||
|
"weather.com": 0.70, "timeanddate.com": 0.30, "thesaurus.com": 0.40,
|
||||||
|
"kayak.com": 0.40, "booking.com": 0.55, "airbnb.com": 0.40
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -120,6 +120,11 @@ class Profile:
|
|||||||
webgl: WebGLProfile
|
webgl: WebGLProfile
|
||||||
fonts: List[str]
|
fonts: List[str]
|
||||||
dark_theme: bool
|
dark_theme: bool
|
||||||
|
# Bayesian browsing-history: list of {name, category, cookie_profile}
|
||||||
|
# dicts sampled from data/browsing_pool.json with per-class CPT. Used
|
||||||
|
# by _recaptcha_seed.py to build a coherent cookie pre-seed when the
|
||||||
|
# caller opts in via Stealthfox(prep_recaptcha=True).
|
||||||
|
browsing_history: List[Dict[str, str]] = field(default_factory=list)
|
||||||
_raw: Dict[str, Any] = field(default_factory=dict, repr=False, compare=False)
|
_raw: Dict[str, Any] = field(default_factory=dict, repr=False, compare=False)
|
||||||
|
|
||||||
def to_prefs_dict(self) -> Dict[str, Any]:
|
def to_prefs_dict(self) -> Dict[str, Any]:
|
||||||
@@ -255,5 +260,6 @@ def generate_profile(seed: int, pin: Optional[Dict[str, Any]] = None) -> Profile
|
|||||||
webgl=WebGLProfile(msaa_samples=int(raw["msaa_samples"])),
|
webgl=WebGLProfile(msaa_samples=int(raw["msaa_samples"])),
|
||||||
fonts=fonts,
|
fonts=fonts,
|
||||||
dark_theme=bool(raw["dark_theme"]),
|
dark_theme=bool(raw["dark_theme"]),
|
||||||
|
browsing_history=list(raw.get("browsing_history") or []),
|
||||||
_raw=raw,
|
_raw=raw,
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -0,0 +1,340 @@
|
|||||||
|
"""Deterministic reCAPTCHA cookie pre-seed.
|
||||||
|
|
||||||
|
Consumes the Bayesian-sampled `browsing_history` from the persona Profile
|
||||||
|
(see `_fpforge/_sampler.py:derive_browsing_history`). For each visited
|
||||||
|
site, builds 1-7 realistic cookies whose composition is chosen by the
|
||||||
|
site's `cookie_profile` tag (analytics-only / consent / cloudflare-bot-
|
||||||
|
management / etc.). All values seeded deterministically from the persona
|
||||||
|
seed, so a given persona always presents the SAME cookies across sessions.
|
||||||
|
|
||||||
|
In addition, always seeds 5 cookies on .google.com (NID, CONSENT, SOCS,
|
||||||
|
_GRECAPTCHA, ENID). Excludes 1P_JAR which was deprecated by Google in 2022
|
||||||
|
— including it now is an anachronism flag.
|
||||||
|
|
||||||
|
Public API:
|
||||||
|
await seed_recaptcha_cookies_async(context, profile, timezone=None)
|
||||||
|
seed_recaptcha_cookies_sync(context, profile, timezone=None)
|
||||||
|
|
||||||
|
`profile` is an `_fpforge.Profile`; `timezone` is the IANA tz (e.g.
|
||||||
|
"Europe/Rome") used to derive the CONSENT cookie's language token, so a
|
||||||
|
European-tz persona gets CONSENT in their language not en+FX.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import datetime
|
||||||
|
import random
|
||||||
|
import time
|
||||||
|
from typing import Any, List, Optional
|
||||||
|
|
||||||
|
# URL-safe base64 alphabet (A-Z, a-z, 0-9, "-", "_"; no "=" padding char).
# _b64_rand samples from it one character at a time.
_B64_ALPHABET = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_"
# Lowercase hexadecimal digits; _hex_rand samples from it one character at a time.
_HEX_ALPHABET = "0123456789abcdef"
|
||||||
|
|
||||||
|
|
||||||
|
def _sub_seed(seed: int, tag: str) -> int:
|
||||||
|
"""FNV-1a mix → independent PRNG streams per logical bucket from one seed."""
|
||||||
|
h = 0xcbf29ce484222325 ^ (seed & 0xFFFFFFFF)
|
||||||
|
for c in tag.encode("ascii"):
|
||||||
|
h ^= c
|
||||||
|
h = (h * 0x100000001b3) & 0xFFFFFFFFFFFFFFFF
|
||||||
|
return h or 0xdeadbeef
|
||||||
|
|
||||||
|
|
||||||
|
def _b64_rand(rng: random.Random, length: int) -> str:
    """Return ``length`` characters drawn from the URL-safe base64 alphabet.

    One ``rng.choice`` call is made per character, so the output is fully
    determined by the rng state. A ``length`` of 0 or less gives ``""``.
    """
    picked = [rng.choice(_B64_ALPHABET) for _ in range(length)]
    return "".join(picked)
|
||||||
|
|
||||||
|
|
||||||
|
def _hex_rand(rng: random.Random, length: int) -> str:
    """Return ``length`` lowercase hexadecimal digits drawn from ``rng``.

    One ``rng.choice`` call is made per digit, so the output is fully
    determined by the rng state. A ``length`` of 0 or less gives ``""``.
    """
    digits = [rng.choice(_HEX_ALPHABET) for _ in range(length)]
    return "".join(digits)
|
||||||
|
|
||||||
|
|
||||||
|
def _yyyymmdd_utc(ts: int) -> str:
|
||||||
|
return datetime.datetime.utcfromtimestamp(ts).strftime("%Y%m%d")
|
||||||
|
|
||||||
|
|
||||||
|
# IANA timezone -> (region_token, lang) for CONSENT cookie coherence.
# The pair is rendered into the CONSENT value as `<lang>+<region_token>+NNN`.
#   - European zones carry their own country code and language
#     (e.g. Europe/Rome -> `it+IT+NNN`).
#   - The listed Asian zones keep the generic `FX` region token but use a
#     local language (e.g. Asia/Tokyo -> `ja+FX+NNN`).
#   - Any timezone not in this map (including all America/* zones) falls back
#     to `en+FX+NNN`; that default lives in _consent_region_lang.
_TZ_TO_REGION = {
    "Europe/Rome": ("IT", "it"),
    "Europe/Berlin": ("DE", "de"),
    "Europe/Paris": ("FR", "fr"),
    "Europe/Madrid": ("ES", "es"),
    "Europe/London": ("GB", "en"),
    "Europe/Amsterdam": ("NL", "nl"),
    "Europe/Brussels": ("BE", "fr"),
    "Europe/Vienna": ("AT", "de"),
    "Europe/Zurich": ("CH", "de"),
    "Europe/Dublin": ("IE", "en"),
    "Europe/Lisbon": ("PT", "pt"),
    "Europe/Stockholm": ("SE", "sv"),
    "Europe/Oslo": ("NO", "no"),
    "Europe/Copenhagen": ("DK", "da"),
    "Europe/Helsinki": ("FI", "fi"),
    "Europe/Warsaw": ("PL", "pl"),
    "Europe/Prague": ("CZ", "cs"),
    "Europe/Athens": ("GR", "el"),
    "Asia/Tokyo": ("FX", "ja"),
    "Asia/Shanghai": ("FX", "zh"),
    "Asia/Hong_Kong": ("FX", "zh"),
    "Asia/Seoul": ("FX", "ko"),
}
|
||||||
|
|
||||||
|
|
||||||
|
def _consent_region_lang(timezone: Optional[str]) -> tuple:
    """Resolve an IANA timezone to the CONSENT ``(region_token, lang)`` pair.

    Looks ``timezone`` up in ``_TZ_TO_REGION``. ``None``, an empty string and
    any timezone missing from the table all yield the default ``("FX", "en")``.
    """
    fallback = ("FX", "en")
    if not timezone:
        return fallback
    return _TZ_TO_REGION.get(timezone, fallback)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# .google.com cookie batch (always present, regardless of browsing history)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def _google_cookies(rng: random.Random, now: int,
                    timezone: Optional[str] = None) -> List[dict]:
    """Build the fixed batch of five ``.google.com`` cookies.

    Args:
        rng: Randomness source. Values are drawn in a fixed order, so the
            same rng state always produces the same cookies.
        now: Reference Unix time in seconds; every expiry is an offset from it.
        timezone: IANA timezone handed to ``_consent_region_lang`` to choose
            the CONSENT ``lang+region`` token.

    Returns:
        Cookie dicts in the order NID, CONSENT, SOCS, _GRECAPTCHA, ENID.
        NID and _GRECAPTCHA expire 180 days after ``now``, the rest 395 days
        after. Only NID and ENID carry ``httpOnly``.
    """
    day = 86400

    def build(name, value, ttl_days, same_site, http_only=False):
        # Keys are inserted in the same order as the original literals:
        # httpOnly (when present) sits between expires and secure.
        cookie = {
            "name": name,
            "value": value,
            "domain": ".google.com",
            "path": "/",
            "expires": now + ttl_days * day,
        }
        if http_only:
            cookie["httpOnly"] = True
        cookie["secure"] = True
        cookie["sameSite"] = same_site
        return cookie

    # --- draw every random value first, in a fixed order -------------------
    consent_age = rng.randint(60, 720) * day
    region, lang = _consent_region_lang(timezone)

    # NID 3-digit prefix range broadened to 100-540 to cover historical NID
    # versions (137, 105, 511, 525 etc. observed in real captures).
    nid_value = f"{rng.randint(100, 540)}={_b64_rand(rng, 178)}"

    consent_day = _yyyymmdd_utc(now - consent_age)
    consent_value = (
        f"YES+cb.{consent_day}-"
        f"{rng.randint(10, 19):02d}-p{rng.randint(0, 9)}."
        f"{lang}+{region}+{rng.randint(100, 999)}"
    )

    socs_value = f"CAES{_b64_rand(rng, 56)}"
    grecaptcha_value = _b64_rand(rng, 124)
    enid_value = _b64_rand(rng, 252)

    # --- assemble ------------------------------------------------------------
    # 1P_JAR removed: Google deprecated it in 2022. Including it now is
    # an anachronism flag for fingerprinters that look at cookie freshness.
    return [
        build("NID", nid_value, 180, "None", http_only=True),
        build("CONSENT", consent_value, 395, "Lax"),
        build("SOCS", socs_value, 395, "Lax"),
        build("_GRECAPTCHA", grecaptcha_value, 180, "None"),
        build("ENID", enid_value, 395, "Lax", http_only=True),
    ]
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Per-site cookie generators (recipes keyed by site["cookie_profile"])
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def _norm_domain(domain: str) -> str:
|
||||||
|
return domain if domain.startswith(".") else "." + domain
|
||||||
|
|
||||||
|
|
||||||
|
def _ga_cookie(rng: random.Random, now: int, domain: str) -> dict:
|
||||||
|
first_age = rng.randint(7, 395) * 86400
|
||||||
|
return {"name": "_ga",
|
||||||
|
"value": f"GA1.2.{rng.randint(100000000, 999999999)}.{now - first_age}",
|
||||||
|
"domain": domain, "path": "/",
|
||||||
|
"expires": now + 395 * 86400,
|
||||||
|
"secure": True, "sameSite": "Lax"}
|
||||||
|
|
||||||
|
|
||||||
|
def _gid_cookie(rng: random.Random, now: int, domain: str) -> dict:
|
||||||
|
return {"name": "_gid",
|
||||||
|
"value": f"GA1.2.{rng.randint(100000000, 999999999)}.{now - rng.randint(60, 86400)}",
|
||||||
|
"domain": domain, "path": "/",
|
||||||
|
"expires": now + 86400,
|
||||||
|
"secure": True, "sameSite": "Lax"}
|
||||||
|
|
||||||
|
|
||||||
|
def _cf_bm_cookie(rng: random.Random, now: int, domain: str) -> dict:
    """Build a Cloudflare bot-management ``__cf_bm`` cookie dict for ``domain``.

    The value is ``<43 base64url chars>.<ts>-1-1-1-1`` with ``ts`` drawn
    uniformly from ``[1700000000, now]``. The cookie expires 30 minutes after
    ``now`` and is ``SameSite=None``.

    Raises ``ValueError`` (from ``rng.randint``) if ``now`` is below 1700000000.
    """
    # Draw order matters for determinism: token first, then the timestamp.
    token = _b64_rand(rng, 43)
    stamp = rng.randint(1700000000, now)
    return {
        "name": "__cf_bm",
        "value": f"{token}.{stamp}-1-1-1-1",
        "domain": domain,
        "path": "/",
        "expires": now + 1800,
        "secure": True,
        "sameSite": "None",
    }
|
||||||
|
|
||||||
|
|
||||||
|
def _onetrust_cookie(rng: random.Random, now: int, domain: str) -> dict:
|
||||||
|
age_d = rng.randint(7, 365)
|
||||||
|
iso = datetime.datetime.utcfromtimestamp(now - age_d * 86400).strftime(
|
||||||
|
"%Y-%m-%dT%H:%M:%S.000Z"
|
||||||
|
)
|
||||||
|
return {"name": "OptanonAlertBoxClosed",
|
||||||
|
"value": iso,
|
||||||
|
"domain": domain, "path": "/",
|
||||||
|
"expires": now + 395 * 86400,
|
||||||
|
"secure": True, "sameSite": "Lax"}
|
||||||
|
|
||||||
|
|
||||||
|
def _cookieyes_cookie(rng: random.Random, now: int, domain: str) -> dict:
    """Build a CookieYes ``cookieyes-consent`` cookie dict for ``domain``.

    The value records an accept-all choice under a random 28-character
    consent id. The cookie expires 395 days after ``now``.
    """
    consent_id = _b64_rand(rng, 28)
    accepted = ",consent:yes,action:yes,necessary:yes,functional:yes,analytics:yes"
    return {
        "name": "cookieyes-consent",
        "value": "consentid:" + consent_id + accepted,
        "domain": domain,
        "path": "/",
        "expires": now + 395 * 86400,
        "secure": True,
        "sameSite": "Lax",
    }
|
||||||
|
|
||||||
|
|
||||||
|
def _clarity_cookie(rng: random.Random, now: int, domain: str) -> dict:
    """Build a Microsoft Clarity ``_clck`` cookie dict for ``domain``.

    The value is ``<8 hex>|2|f<2 digits>|0|<ts>`` with ``ts`` placed 60-180
    days before ``now``. The cookie expires 365 days after ``now``.
    """
    # Draw order matters for determinism: hex id, 2-digit field, then the age.
    user_hex = _hex_rand(rng, 8)
    two_digits = rng.randint(10, 99)
    days_ago = rng.randint(60, 180)
    return {
        "name": "_clck",
        "value": f"{user_hex}|2|f{two_digits}|0|{now - days_ago * 86400}",
        "domain": domain,
        "path": "/",
        "expires": now + 365 * 86400,
        "secure": True,
        "sameSite": "Lax",
    }
|
||||||
|
|
||||||
|
|
||||||
|
def _fbp_cookie(rng: random.Random, now: int, domain: str) -> dict:
|
||||||
|
"""Facebook Pixel _fbp = fb.<subdomain_index>.<unix_ms>.<random_int>"""
|
||||||
|
return {"name": "_fbp",
|
||||||
|
"value": f"fb.1.{(now - rng.randint(60, 30*86400)) * 1000}."
|
||||||
|
f"{rng.randint(100000000, 9999999999)}",
|
||||||
|
"domain": domain, "path": "/",
|
||||||
|
"expires": now + 90 * 86400,
|
||||||
|
"secure": True, "sameSite": "Lax"}
|
||||||
|
|
||||||
|
|
||||||
|
def _gtm_cookie(rng: random.Random, now: int, domain: str) -> dict:
|
||||||
|
"""_dc_gtm_<container_id>=1 — Google Tag Manager throttle flag."""
|
||||||
|
container = f"UA-{rng.randint(10000000, 99999999)}-{rng.randint(1, 9)}"
|
||||||
|
return {"name": f"_dc_gtm_{container}",
|
||||||
|
"value": "1",
|
||||||
|
"domain": domain, "path": "/",
|
||||||
|
"expires": now + 60,
|
||||||
|
"secure": True, "sameSite": "Lax"}
|
||||||
|
|
||||||
|
|
||||||
|
def _hssrc_cookie(rng: random.Random, now: int, domain: str) -> dict:
|
||||||
|
"""HubSpot referrer flag — small int."""
|
||||||
|
return {"name": "__hssrc",
|
||||||
|
"value": str(rng.randint(1, 5)),
|
||||||
|
"domain": domain, "path": "/",
|
||||||
|
"expires": now + 1800,
|
||||||
|
"secure": True, "sameSite": "Lax"}
|
||||||
|
|
||||||
|
|
||||||
|
def _cookies_for_profile(profile: str, rng: random.Random,
                         now: int, domain: str) -> List[dict]:
    """Expand a `cookie_profile` tag (from browsing_pool.json) into cookies.

    Every recipe starts with a `_ga` cookie; the tag decides what follows:

    - ``minimal`` / unknown tag: `_ga` only (safe fallback).
    - ``ga_only``: + `_gid`, and a GTM helper with probability 0.3.
    - ``ga_cf``: + `__cf_bm`.
    - ``ga_consent``: + `_gid`, one consent-banner cookie (OneTrust or
      CookieYes, 50/50), and a GTM helper with probability 0.4.
    - ``ga_consent_clarity``: + `_gid`, `_clck`, one consent-banner cookie,
      then optionally `_fbp` (0.5), GTM helper (0.4) and `__hssrc` (0.25).

    All choices are drawn from ``rng``, so the result is deterministic for a
    given generator state. The domain is passed through `_norm_domain` first.
    """
    site = _norm_domain(domain)

    def consent_banner() -> dict:
        # The coin flip happens BEFORE the chosen builder consumes rng.
        builder = _onetrust_cookie if rng.random() < 0.5 else _cookieyes_cookie
        return builder(rng, now, site)

    def maybe(probability: float, builder) -> List[dict]:
        # Always consumes one draw, whether or not the cookie is emitted.
        if rng.random() < probability:
            return [builder(rng, now, site)]
        return []

    jar = [_ga_cookie(rng, now, site)]

    if profile == "ga_only":
        jar.append(_gid_cookie(rng, now, site))
        jar += maybe(0.3, _gtm_cookie)
    elif profile == "ga_cf":
        jar.append(_cf_bm_cookie(rng, now, site))
    elif profile == "ga_consent":
        jar.append(_gid_cookie(rng, now, site))
        jar.append(consent_banner())
        jar += maybe(0.4, _gtm_cookie)
    elif profile == "ga_consent_clarity":
        # Heavy-tracking site: GA + Clarity + consent, often with FB pixel.
        jar.append(_gid_cookie(rng, now, site))
        jar.append(_clarity_cookie(rng, now, site))
        jar.append(consent_banner())
        jar += maybe(0.5, _fbp_cookie)
        jar += maybe(0.4, _gtm_cookie)
        jar += maybe(0.25, _hssrc_cookie)
    # "minimal" and any unrecognised tag keep just the `_ga` cookie.
    return jar
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Public builder
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def build_cookies(seed: int,
                  browsing_history: Optional[List[dict]] = None,
                  now: Optional[int] = None,
                  timezone: Optional[str] = None) -> List[dict]:
    """Assemble the complete, deterministic cookie list for one persona.

    The .google.com batch always comes first, followed by the cookies of
    each visited site in history order. Each part uses its own generator
    derived from ``seed`` and a tag ("google", or "dom:<site name>"), so a
    site's cookies do not depend on its position in the history.

    Args:
        seed: persona integer seed (from `Profile.seed`).
        browsing_history: list of {name, category, cookie_profile} dicts as
            sampled by `_fpforge.derive_browsing_history`. None or empty
            means only the .google.com cookies are returned. An entry
            without `cookie_profile` is treated as "minimal".
        now: unix-seconds timestamp; defaults to the current time. Pin it
            in tests.
        timezone: IANA tz forwarded to the Google batch to derive the
            CONSENT cookie's `lang+region` token
            (e.g. "Europe/Rome" → "it+IT", "America/New_York" → "en+FX").

    Returns:
        A list of cookie dicts.
    """
    stamp = int(time.time()) if now is None else now
    base_seed = int(seed)

    # Google batch is always present; CONSENT language follows the timezone.
    google_rng = random.Random(_sub_seed(base_seed, "google"))
    jar: List[dict] = list(_google_cookies(google_rng, stamp, timezone=timezone))

    # One independent random stream per site, keyed on seed × domain.
    for entry in browsing_history or ():
        host = entry["name"]
        site_rng = random.Random(_sub_seed(base_seed, f"dom:{host}"))
        recipe = entry.get("cookie_profile", "minimal")
        jar += _cookies_for_profile(recipe, site_rng, stamp, host)
    return jar
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_seed_and_history(profile: Any) -> tuple:
    """Normalise the supported ``profile`` inputs to ``(seed, history)``.

    Accepted forms:

    - an ``int`` seed → ``(seed, [])``;
    - a ``(seed, history)`` 2-tuple → ``(int(seed), list(history or []))``;
    - any object with a ``seed`` attribute (e.g. a Profile); its optional
      ``browsing_history`` attribute supplies the history.

    Returns:
        ``(seed, history)`` where ``seed`` is an int and ``history`` is a
        fresh list (never None).

    Raises:
        AttributeError: if ``profile`` is none of the accepted forms.
    """
    if isinstance(profile, int):
        return int(profile), []
    # Plain 2-tuples are documented input but have no `.seed`; unpack them
    # here. Tuple-like objects that DO expose `.seed` use the attribute path.
    if (isinstance(profile, tuple) and len(profile) == 2
            and not hasattr(profile, "seed")):
        raw_seed, raw_history = profile
        return int(raw_seed), list(raw_history or [])
    seed = int(getattr(profile, "seed"))
    # `or []` also covers an explicit `browsing_history = None`.
    history = list(getattr(profile, "browsing_history", []) or [])
    return seed, history
|
||||||
|
|
||||||
|
|
||||||
|
async def seed_recaptcha_cookies_async(context: Any, profile: Any,
                                       timezone: Optional[str] = None) -> None:
    """Async: inject deterministic persona cookies into the context.

    Injection is best-effort: if ``context.add_cookies`` raises, the error
    is logged as a warning and swallowed so the caller still gets a usable
    context. Errors while *building* the cookie list are not caught.

    Args:
        context: object exposing an awaitable ``add_cookies(list)`` method.
        profile: a Profile-like object or int seed, as accepted by
            `_extract_seed_and_history`.
        timezone: IANA timezone forwarded to `build_cookies`.
    """
    import logging  # local import keeps this block free of file-level deps

    seed, history = _extract_seed_and_history(profile)
    cookies = build_cookies(seed, history, timezone=timezone)
    try:
        await context.add_cookies(cookies)
    except Exception:
        # Pre-seeding is optional; report the failure instead of hiding it,
        # but never break context creation because of it.
        logging.getLogger(__name__).warning(
            "reCAPTCHA cookie pre-seed failed; continuing without seeded cookies",
            exc_info=True,
        )
|
||||||
|
|
||||||
|
|
||||||
|
def seed_recaptcha_cookies_sync(context: Any, profile: Any,
                                timezone: Optional[str] = None) -> None:
    """Sync: inject deterministic persona cookies into the context.

    Injection is best-effort: if ``context.add_cookies`` raises, the error
    is logged as a warning and swallowed so the caller still gets a usable
    context. Errors while *building* the cookie list are not caught.

    Args:
        context: object exposing an ``add_cookies(list)`` method.
        profile: a Profile-like object or int seed, as accepted by
            `_extract_seed_and_history`.
        timezone: IANA timezone forwarded to `build_cookies`.
    """
    import logging  # local import keeps this block free of file-level deps

    seed, history = _extract_seed_and_history(profile)
    cookies = build_cookies(seed, history, timezone=timezone)
    try:
        context.add_cookies(cookies)
    except Exception:
        # Pre-seeding is optional; report the failure instead of hiding it,
        # but never break context creation because of it.
        logging.getLogger(__name__).warning(
            "reCAPTCHA cookie pre-seed failed; continuing without seeded cookies",
            exc_info=True,
        )
|
||||||
|
|
||||||
|
|
||||||
|
# Names exported by `from ... import *`: the pure cookie builder plus the
# async and sync injection helpers. Underscore-prefixed helpers stay private.
__all__ = [
    "build_cookies",
    "seed_recaptcha_cookies_async",
    "seed_recaptcha_cookies_sync",
]
|
||||||
@@ -51,6 +51,7 @@ class InvisiblePlaywright:
|
|||||||
extra_prefs: Optional[Dict[str, Any]] = None,
|
extra_prefs: Optional[Dict[str, Any]] = None,
|
||||||
binary_path: Optional[str] = None,
|
binary_path: Optional[str] = None,
|
||||||
profile_dir: Optional[Union[str, Path]] = None,
|
profile_dir: Optional[Union[str, Path]] = None,
|
||||||
|
prep_recaptcha: bool = False,
|
||||||
) -> None:
|
) -> None:
|
||||||
# See sync launcher: `zoom.stealth.fpp.hw_seed` is int32_t — clamp.
|
# See sync launcher: `zoom.stealth.fpp.hw_seed` is int32_t — clamp.
|
||||||
self.seed: int = int(seed) if seed is not None else secrets.randbits(31)
|
self.seed: int = int(seed) if seed is not None else secrets.randbits(31)
|
||||||
@@ -64,6 +65,8 @@ class InvisiblePlaywright:
|
|||||||
self._extra_prefs = extra_prefs
|
self._extra_prefs = extra_prefs
|
||||||
self._binary_path = binary_path
|
self._binary_path = binary_path
|
||||||
self._profile_dir: Optional[Path] = Path(profile_dir) if profile_dir else None
|
self._profile_dir: Optional[Path] = Path(profile_dir) if profile_dir else None
|
||||||
|
# reCAPTCHA pre-seed gated server-side; respect persistent profile.
|
||||||
|
self._prep_recaptcha = bool(prep_recaptcha) and self._profile_dir is None
|
||||||
self._profile: Profile = generate_profile(self.seed, pin=self._pin)
|
self._profile: Profile = generate_profile(self.seed, pin=self._pin)
|
||||||
self._pw: Optional[Playwright] = None
|
self._pw: Optional[Playwright] = None
|
||||||
self._browser: Optional[Browser] = None
|
self._browser: Optional[Browser] = None
|
||||||
@@ -124,12 +127,18 @@ class InvisiblePlaywright:
|
|||||||
def _patch_new_context_defaults(self, browser: Browser) -> None:
|
def _patch_new_context_defaults(self, browser: Browser) -> None:
|
||||||
original = browser.new_context
|
original = browser.new_context
|
||||||
defaults = self._default_context_kwargs()
|
defaults = self._default_context_kwargs()
|
||||||
|
prep = self._prep_recaptcha
|
||||||
|
profile = self._profile # pass the whole Profile (seed + browsing_history)
|
||||||
|
tz = self._timezone # used by _recaptcha_seed for CONSENT lang+region
|
||||||
|
|
||||||
async def patched(**kw):
|
async def patched(**kw):
|
||||||
merged = dict(defaults)
|
merged = dict(defaults)
|
||||||
merged.update(kw)
|
merged.update(kw)
|
||||||
ctx = await original(**merged)
|
ctx = await original(**merged)
|
||||||
_patch_new_page_sleep(ctx)
|
_patch_new_page_sleep(ctx)
|
||||||
|
if prep:
|
||||||
|
from ._recaptcha_seed import seed_recaptcha_cookies_async
|
||||||
|
await seed_recaptcha_cookies_async(ctx, profile, timezone=tz)
|
||||||
return ctx
|
return ctx
|
||||||
|
|
||||||
browser.new_context = patched # type: ignore[assignment]
|
browser.new_context = patched # type: ignore[assignment]
|
||||||
|
|||||||
@@ -113,6 +113,7 @@ class InvisiblePlaywright:
|
|||||||
extra_prefs: Optional[Dict[str, Any]] = None,
|
extra_prefs: Optional[Dict[str, Any]] = None,
|
||||||
binary_path: Optional[str] = None,
|
binary_path: Optional[str] = None,
|
||||||
profile_dir: Optional[Union[str, Path]] = None,
|
profile_dir: Optional[Union[str, Path]] = None,
|
||||||
|
prep_recaptcha: bool = False,
|
||||||
) -> None:
|
) -> None:
|
||||||
"""
|
"""
|
||||||
Args:
|
Args:
|
||||||
@@ -166,6 +167,10 @@ class InvisiblePlaywright:
|
|||||||
self._extra_prefs = extra_prefs
|
self._extra_prefs = extra_prefs
|
||||||
self._binary_path = binary_path
|
self._binary_path = binary_path
|
||||||
self._profile_dir: Optional[Path] = Path(profile_dir) if profile_dir else None
|
self._profile_dir: Optional[Path] = Path(profile_dir) if profile_dir else None
|
||||||
|
# reCAPTCHA cookie pre-seed — opt-in. Gated server-side: if a
|
||||||
|
# persistent profile_dir is in use, respect its existing cookies
|
||||||
|
# and DON'T enable pre-seed (the profile owns its own state).
|
||||||
|
self._prep_recaptcha = bool(prep_recaptcha) and self._profile_dir is None
|
||||||
self._profile: Profile = generate_profile(self.seed, pin=self._pin)
|
self._profile: Profile = generate_profile(self.seed, pin=self._pin)
|
||||||
self._pw: Optional[Playwright] = None
|
self._pw: Optional[Playwright] = None
|
||||||
self._browser: Optional[Browser] = None
|
self._browser: Optional[Browser] = None
|
||||||
@@ -240,12 +245,18 @@ class InvisiblePlaywright:
|
|||||||
"""
|
"""
|
||||||
original = browser.new_context
|
original = browser.new_context
|
||||||
defaults = self._default_context_kwargs()
|
defaults = self._default_context_kwargs()
|
||||||
|
prep = self._prep_recaptcha
|
||||||
|
profile = self._profile # pass the whole Profile (seed + browsing_history)
|
||||||
|
tz = self._timezone # used by _recaptcha_seed for CONSENT lang+region
|
||||||
|
|
||||||
def patched(**kw):
|
def patched(**kw):
|
||||||
merged = dict(defaults)
|
merged = dict(defaults)
|
||||||
merged.update(kw) # user-supplied wins
|
merged.update(kw) # user-supplied wins
|
||||||
ctx = original(**merged)
|
ctx = original(**merged)
|
||||||
_patch_sync_new_page_sleep(ctx)
|
_patch_sync_new_page_sleep(ctx)
|
||||||
|
if prep:
|
||||||
|
from ._recaptcha_seed import seed_recaptcha_cookies_sync
|
||||||
|
seed_recaptcha_cookies_sync(ctx, profile, timezone=tz)
|
||||||
return ctx
|
return ctx
|
||||||
|
|
||||||
browser.new_context = patched # type: ignore[assignment]
|
browser.new_context = patched # type: ignore[assignment]
|
||||||
|
|||||||
@@ -306,17 +306,6 @@ def test_navigator_oscpu_matches_userAgent(page):
|
|||||||
assert "Mac" in oscpu
|
assert "Mac" in oscpu
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.e2e
|
|
||||||
def test_userAgent_contains_appVersion_chromium_only(page):
|
|
||||||
"""Chromium invariant: UA contains appVersion. Firefox uses a short
|
|
||||||
appVersion form so the check is gated on `'chrome' in window`."""
|
|
||||||
if not _ev(page, "'chrome' in window"):
|
|
||||||
pytest.skip("Chromium-only invariant")
|
|
||||||
ua = _ev(page, "navigator.userAgent")
|
|
||||||
av = _ev(page, "navigator.appVersion")
|
|
||||||
assert av in ua
|
|
||||||
|
|
||||||
|
|
||||||
# ===========================================================================
|
# ===========================================================================
|
||||||
# 5. Native function self-toString (creepjs/src/lies/index.ts hasKnownToString)
|
# 5. Native function self-toString (creepjs/src/lies/index.ts hasKnownToString)
|
||||||
# ===========================================================================
|
# ===========================================================================
|
||||||
|
|||||||
@@ -0,0 +1,349 @@
|
|||||||
|
"""Unit tests for the deterministic reCAPTCHA cookie builder.
|
||||||
|
|
||||||
|
Validates the contract:
|
||||||
|
- 5 .google.com cookies always present (NID, CONSENT, SOCS, _GRECAPTCHA, ENID)
|
||||||
|
- Per-site cookies built from a `browsing_history` list (sampled by the
|
||||||
|
Bayesian network in _fpforge)
|
||||||
|
- Determinism: same (seed, history) → identical content
|
||||||
|
- Chrome 400-day cookie cap respected
|
||||||
|
- Playwright add_cookies field requirements satisfied
|
||||||
|
"""
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from invisible_playwright._recaptcha_seed import (
|
||||||
|
build_cookies,
|
||||||
|
_sub_seed,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# Apply the `unit` marker to every test in this module.
pytestmark = pytest.mark.unit


_FIXED_NOW = 1779600000  # 2026-05-24T05:20:00Z, frozen for determinism


# Sample browsing history for tests (mimics what _fpforge produces).
# One entry per cookie_profile recipe exercised below, except `ga_consent`.
_SAMPLE_HISTORY = [
    {"name": "github.com", "category": "dev", "cookie_profile": "ga_cf"},
    {"name": "stackoverflow.com", "category": "dev", "cookie_profile": "ga_consent_clarity"},
    {"name": "amazon.com", "category": "shop", "cookie_profile": "ga_consent_clarity"},
    {"name": "wikipedia.org", "category": "reference", "cookie_profile": "minimal"},
    {"name": "youtube.com", "category": "media", "cookie_profile": "ga_only"},
]
|
||||||
|
|
||||||
|
|
||||||
|
# ===========================================================================
|
||||||
|
# 1. Set composition
|
||||||
|
# ===========================================================================
|
||||||
|
|
||||||
|
def test_only_google_cookies_when_no_history():
    """With no browsing history the jar holds exactly the 5 .google.com
    cookies (1P_JAR was removed in realism round 2; Google deprecated it
    in 2022)."""
    jar = build_cookies(seed=42, browsing_history=None, now=_FIXED_NOW)
    expected = ["NID", "CONSENT", "SOCS", "_GRECAPTCHA", "ENID"]
    assert sorted(cookie["name"] for cookie in jar) == sorted(expected)
    # Every cookie must live on .google.com and nowhere else.
    assert {cookie["domain"] for cookie in jar} <= {".google.com"}
|
||||||
|
|
||||||
|
|
||||||
|
def test_browsing_history_adds_host_cookies():
    """Every site in the history contributes at least one cookie on its
    own (dot-prefixed) domain, alongside the 5 Google cookies."""
    jar = build_cookies(seed=42, browsing_history=_SAMPLE_HISTORY, now=_FIXED_NOW)

    google_count = sum(1 for cookie in jar if cookie["domain"] == ".google.com")
    assert google_count == 5  # 1P_JAR removed

    host_domains = {cookie["domain"] for cookie in jar} - {".google.com"}
    missing = [site["name"] for site in _SAMPLE_HISTORY
               if f".{site['name']}" not in host_domains]
    assert not missing
|
||||||
|
|
||||||
|
|
||||||
|
def test_domain_dot_prefix_normalized():
    """Each cookie domain carries a leading dot so sub-domains are covered."""
    jar = build_cookies(seed=42, browsing_history=_SAMPLE_HISTORY, now=_FIXED_NOW)
    for cookie in jar:
        domain = cookie["domain"]
        assert domain[:1] == ".", f"missing dot: {domain}"
|
||||||
|
|
||||||
|
|
||||||
|
# ===========================================================================
|
||||||
|
# 2. Cookie profile recipes (each profile yields the expected cookie set)
|
||||||
|
# ===========================================================================
|
||||||
|
|
||||||
|
def test_profile_minimal_yields_ga_only():
    """The `minimal` recipe emits a single `_ga` cookie on the site."""
    jar = build_cookies(
        seed=42,
        browsing_history=[{"name": "x.com", "cookie_profile": "minimal"}],
        now=_FIXED_NOW,
    )
    site_names = [cookie["name"] for cookie in jar if cookie["domain"] == ".x.com"]
    assert site_names == ["_ga"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_profile_ga_only_yields_ga_and_gid():
    """`ga_only` always yields `_ga` + `_gid`.

    The recipe also appends a `_dc_gtm_*` helper with 30% probability, so
    that optional cookie is tolerated rather than relying on seed 42
    happening to skip it.
    """
    history = [{"name": "x.com", "cookie_profile": "ga_only"}]
    cookies = build_cookies(seed=42, browsing_history=history, now=_FIXED_NOW)
    names = sorted(c["name"] for c in cookies if c["domain"] == ".x.com")
    core = [n for n in names if not n.startswith("_dc_gtm_")]
    assert core == ["_ga", "_gid"]
    # At most one optional GTM helper may accompany the core pair.
    assert len(names) - len(core) <= 1
|
||||||
|
|
||||||
|
|
||||||
|
def test_profile_ga_cf_yields_ga_and_cf_bm():
    """The `ga_cf` recipe emits exactly `_ga` and `__cf_bm`."""
    jar = build_cookies(
        seed=42,
        browsing_history=[{"name": "x.com", "cookie_profile": "ga_cf"}],
        now=_FIXED_NOW,
    )
    site_names = [cookie["name"] for cookie in jar if cookie["domain"] == ".x.com"]
    site_names.sort()
    assert site_names == ["__cf_bm", "_ga"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_profile_ga_consent_yields_three_cookies():
    """`ga_consent` always yields `_ga` + `_gid` + one consent-banner cookie.

    The recipe also appends a `_dc_gtm_*` helper with 40% probability; it is
    excluded from the count so the test does not depend on the rng draw for
    seed 42.
    """
    history = [{"name": "x.com", "cookie_profile": "ga_consent"}]
    cookies = build_cookies(seed=42, browsing_history=history, now=_FIXED_NOW)
    names = sorted(c["name"] for c in cookies if c["domain"] == ".x.com")
    core = [n for n in names if not n.startswith("_dc_gtm_")]
    # Always _ga + _gid + one of OneTrust|CookieYes
    assert "_ga" in core and "_gid" in core
    assert any(n in core for n in ("OptanonAlertBoxClosed", "cookieyes-consent"))
    assert len(core) == 3
    # At most one optional GTM helper on top of the three core cookies.
    assert len(names) - len(core) <= 1
|
||||||
|
|
||||||
|
|
||||||
|
def test_profile_ga_consent_clarity_yields_at_least_four_cookies():
    """Baseline is `_ga`, `_gid`, `_clck` and a consent-banner cookie; the
    `_fbp`, `_dc_gtm_*` and `__hssrc` helpers are rng-dependent extras
    (see test_new_helper_cookies_*)."""
    jar = build_cookies(
        seed=42,
        browsing_history=[{"name": "x.com", "cookie_profile": "ga_consent_clarity"}],
        now=_FIXED_NOW,
    )
    site_cookies = [cookie for cookie in jar if cookie["domain"] == ".x.com"]
    present = {cookie["name"] for cookie in site_cookies}

    for required in ("_ga", "_gid", "_clck"):
        assert required in present
    assert present & {"OptanonAlertBoxClosed", "cookieyes-consent"}
    assert len(site_cookies) >= 4  # 4 baseline + 0-3 helpers
|
||||||
|
|
||||||
|
|
||||||
|
def test_unknown_profile_falls_back_to_ga():
    """An unrecognised cookie_profile tag degrades to a lone `_ga` cookie."""
    jar = build_cookies(
        seed=42,
        browsing_history=[{"name": "x.com", "cookie_profile": "nonexistent_profile"}],
        now=_FIXED_NOW,
    )
    site_names = [cookie["name"] for cookie in jar if cookie["domain"] == ".x.com"]
    assert site_names == ["_ga"]
|
||||||
|
|
||||||
|
|
||||||
|
# ===========================================================================
|
||||||
|
# 3. Determinism
|
||||||
|
# ===========================================================================
|
||||||
|
|
||||||
|
def test_same_seed_and_history_same_content():
    """Two builds with identical inputs produce identical cookie lists."""
    first, second = (
        build_cookies(seed=42, browsing_history=_SAMPLE_HISTORY, now=_FIXED_NOW)
        for _ in range(2)
    )
    assert first == second
|
||||||
|
|
||||||
|
|
||||||
|
def test_different_seed_different_content():
    """Changing the seed changes the NID cookie value."""

    def nid_value(seed):
        jar = build_cookies(seed=seed, browsing_history=_SAMPLE_HISTORY,
                            now=_FIXED_NOW)
        return next(cookie for cookie in jar if cookie["name"] == "NID")["value"]

    assert nid_value(42) != nid_value(99)
|
||||||
|
|
||||||
|
|
||||||
|
def test_history_order_does_not_affect_domain_specific_cookies():
    """Per-site cookies depend on the domain name only, not on where the
    site sits in the history list."""
    first, second = _SAMPLE_HISTORY[0], _SAMPLE_HISTORY[1]

    def host_values(history):
        # Map (domain, name) -> value for every non-Google cookie.
        jar = build_cookies(seed=42, browsing_history=history, now=_FIXED_NOW)
        return {
            (cookie["domain"], cookie["name"]): cookie["value"]
            for cookie in jar
            if cookie["domain"] != ".google.com"
        }

    assert host_values([first, second]) == host_values([second, first])
|
||||||
|
|
||||||
|
|
||||||
|
def test_sub_seed_distinct_tags_distinct_streams():
    """Different tags under one seed yield different sub-seeds, and seed 0
    does not collapse to a zero sub-seed."""
    google = _sub_seed(42, "google")
    github = _sub_seed(42, "dom:github.com")
    amazon = _sub_seed(42, "dom:amazon.com")
    assert google != github
    assert github != amazon
    assert _sub_seed(0, "any") != 0  # seed=0 still produces non-zero sub-seed
|
||||||
|
|
||||||
|
|
||||||
|
# ===========================================================================
|
||||||
|
# 4. Format / structural correctness for the Google batch
|
||||||
|
# ===========================================================================
|
||||||
|
|
||||||
|
def test_nid_format():
    """NID is `<3-digit prefix>=<178-char payload>` with the prefix in
    100-540 (broadened in realism round 2 to cover historical versions)."""
    jar = build_cookies(seed=42, now=_FIXED_NOW)
    value = next(cookie["value"] for cookie in jar if cookie["name"] == "NID")
    prefix, payload = value.split("=", 1)
    assert prefix.isdigit()
    assert len(prefix) == 3
    assert 100 <= int(prefix) <= 540
    assert len(payload) == 178
|
||||||
|
|
||||||
|
|
||||||
|
def test_consent_format():
    """Without a timezone, CONSENT starts with `YES+cb.` and carries the
    default `FX` region token."""
    jar = build_cookies(seed=42, now=_FIXED_NOW)
    value = next(cookie["value"] for cookie in jar if cookie["name"] == "CONSENT")
    assert value.startswith("YES+cb.")
    assert "+FX+" in value
|
||||||
|
|
||||||
|
|
||||||
|
# ===========================================================================
|
||||||
|
# 5. Chrome 400-day cookie cap compliance
|
||||||
|
# ===========================================================================
|
||||||
|
|
||||||
|
def test_all_expiries_within_400_day_cap():
    """Chrome 104+ caps cookie expiry to 400 days. Cookies > 400d are
    silently truncated / dropped. The builder keeps everything <= 395d;
    this test enforces the 400d hard limit.

    `__cf_bm` and `_gid` are short-lived telemetry cookies and are skipped.
    `1P_JAR` is no longer exempted: the builder does not emit it any more
    (see test_no_1p_jar_cookie), so the exemption was dead.
    """
    cookies = build_cookies(seed=42, browsing_history=_SAMPLE_HISTORY, now=_FIXED_NOW)
    max_allowed = _FIXED_NOW + 400 * 86400
    for c in cookies:
        # Short-lived telemetry cookies are fine
        if c["name"] in ("__cf_bm", "_gid"):
            continue
        assert c["expires"] <= max_allowed, (
            f"Cookie {c['name']} expires {c['expires'] - _FIXED_NOW}s "
            f"(> 400d cap) — would be silently dropped"
        )
|
||||||
|
|
||||||
|
|
||||||
|
# ===========================================================================
|
||||||
|
# 6. Playwright add_cookies field requirements
|
||||||
|
# ===========================================================================
|
||||||
|
|
||||||
|
def test_all_cookies_have_required_playwright_fields():
    """Every cookie has a non-empty name and domain, a value that is not
    None, and path "/"."""
    jar = build_cookies(seed=42, browsing_history=_SAMPLE_HISTORY, now=_FIXED_NOW)
    for cookie in jar:
        name = cookie.get("name")
        assert name, f"missing name: {cookie}"
        assert cookie.get("value") is not None, f"missing value: {cookie}"
        assert cookie.get("domain"), f"missing domain: {cookie}"
        assert cookie.get("path") == "/", f"path != / for {cookie['name']}"
|
||||||
|
|
||||||
|
|
||||||
|
def test_modern_cookies_marked_secure():
    """Any cookie with sameSite=None must also be secure=True, as
    Firefox/Chrome require. This is also generally needed for cookies set
    via Playwright add_cookies without a navigation context."""
    jar = build_cookies(seed=42, browsing_history=_SAMPLE_HISTORY, now=_FIXED_NOW)
    cross_site = (cookie for cookie in jar if cookie.get("sameSite") == "None")
    for cookie in cross_site:
        assert cookie.get("secure") is True, f"{cookie['name']} None+!secure invalid"
|
||||||
|
|
||||||
|
|
||||||
|
def test_httponly_on_signed_cookies():
    """NID and ENID are flagged httpOnly."""
    jar = build_cookies(seed=42, now=_FIXED_NOW)
    for wanted in ("NID", "ENID"):
        cookie = next(c for c in jar if c["name"] == wanted)
        assert cookie.get("httpOnly") is True
|
||||||
|
|
||||||
|
|
||||||
|
# ===========================================================================
|
||||||
|
# 7. End-to-end with real fpforge Profile
|
||||||
|
# ===========================================================================
|
||||||
|
|
||||||
|
def test_with_real_fpforge_profile():
    """End-to-end: generate a real Profile, ensure browsing_history is populated
    and build_cookies works against it."""
    from invisible_playwright._fpforge import generate_profile
    prof = generate_profile(seed=42)
    assert isinstance(prof.browsing_history, list)
    # Bounds are deliberately loose so sampler tuning does not break the test.
    assert 5 <= len(prof.browsing_history) <= 50, \
        f"unexpected history length: {len(prof.browsing_history)}"
    # Each entry has the expected fields
    for site in prof.browsing_history:
        assert "name" in site and "category" in site and "cookie_profile" in site
    # build_cookies works against the real profile
    cookies = build_cookies(seed=prof.seed, browsing_history=prof.browsing_history,
                            now=_FIXED_NOW)
    # 5 google cookies + at least 1 cookie per visited site. (The bound used
    # to be 6, a leftover from when 1P_JAR was still emitted; that only
    # passed when some site contributed more than one cookie.)
    assert len(cookies) >= 5 + len(prof.browsing_history)
|
||||||
|
|
||||||
|
|
||||||
|
def test_same_seed_same_browsing_history_via_fpforge():
    """Generating a profile twice from one seed gives the same
    browsing_history (the Bayesian sampler is deterministic)."""
    from invisible_playwright._fpforge import generate_profile

    first_history = generate_profile(seed=42).browsing_history
    second_history = generate_profile(seed=42).browsing_history
    assert first_history == second_history
|
||||||
|
|
||||||
|
|
||||||
|
# ===========================================================================
|
||||||
|
# 8. Realism improvements (2026-05-24 round 2)
|
||||||
|
# ===========================================================================
|
||||||
|
|
||||||
|
def test_no_1p_jar_cookie():
    """1P_JAR must never be emitted: Google deprecated it in 2022, so its
    presence would be an anachronism flag for fingerprinters that look at
    cookie freshness."""
    jar = build_cookies(seed=42, browsing_history=_SAMPLE_HISTORY, now=_FIXED_NOW)
    assert all(cookie["name"] != "1P_JAR" for cookie in jar)
|
||||||
|
|
||||||
|
|
||||||
|
def test_nid_prefix_broadened_range():
    """Across 200 seeds the NID 3-digit prefix dips below 500 and never
    exceeds 540, i.e. the range is 100-540 rather than just 500-540
    (historical versions 137/105/511/525 were seen in real captures)."""

    def nid_prefix(seed):
        jar = build_cookies(seed=seed, now=_FIXED_NOW)
        nid = next(cookie for cookie in jar if cookie["name"] == "NID")
        return int(nid["value"].split("=", 1)[0])

    seen_prefixes = {nid_prefix(seed) for seed in range(200)}
    assert min(seen_prefixes) < 500, f"NID range never goes below 500 ({sorted(seen_prefixes)[:5]})"
    assert max(seen_prefixes) <= 540
|
||||||
|
|
||||||
|
|
||||||
|
def test_consent_lang_from_timezone_eu():
    """Europe/Rome maps the CONSENT `lang+region` token to it+IT."""
    jar = build_cookies(seed=42, now=_FIXED_NOW, timezone="Europe/Rome")
    value = next(cookie["value"] for cookie in jar if cookie["name"] == "CONSENT")
    assert ".it+IT+" in value, f"expected it+IT in: {value}"
|
||||||
|
|
||||||
|
|
||||||
|
def test_consent_lang_default_fx():
    """A US timezone falls back to the default `en+FX` (non-EU) token."""
    jar = build_cookies(seed=42, now=_FIXED_NOW, timezone="America/New_York")
    value = next(cookie["value"] for cookie in jar if cookie["name"] == "CONSENT")
    assert ".en+FX+" in value
|
||||||
|
|
||||||
|
|
||||||
|
def test_consent_lang_de_for_berlin():
    """Europe/Berlin maps the CONSENT `lang+region` token to de+DE."""
    jar = build_cookies(seed=42, now=_FIXED_NOW, timezone="Europe/Berlin")
    value = next(cookie["value"] for cookie in jar if cookie["name"] == "CONSENT")
    assert ".de+DE+" in value
|
||||||
|
|
||||||
|
|
||||||
|
def test_consent_lang_no_timezone_default():
    """Omitting the timezone (None) yields the default en+FX token."""
    jar = build_cookies(seed=42, now=_FIXED_NOW)
    value = next(cookie["value"] for cookie in jar if cookie["name"] == "CONSENT")
    assert ".en+FX+" in value
|
||||||
|
|
||||||
|
|
||||||
|
def test_new_helper_cookies_appear_in_ga_consent_clarity():
    """Each optional helper of the ga_consent_clarity recipe (`_fbp`,
    `_dc_gtm_*`, `__hssrc`) shows up for at least one of 100 seeds."""
    history = [{"name": "site.com", "cookie_profile": "ga_consent_clarity"}]
    seen = set()
    for seed in range(100):
        jar = build_cookies(seed=seed, browsing_history=history, now=_FIXED_NOW)
        for cookie in jar:
            if cookie["domain"] != ".site.com":
                continue
            name = cookie["name"]
            # GTM names embed a random container id; bucket them by prefix.
            seen.add("_dc_gtm_" if name.startswith("_dc_gtm_") else name)
    assert "_fbp" in seen, "_fbp never appeared in 100 seeds (rng pick broken)"
    assert "_dc_gtm_" in seen, "_dc_gtm_* never appeared in 100 seeds"
    assert "__hssrc" in seen, "__hssrc never appeared in 100 seeds"
|
||||||
|
|
||||||
|
|
||||||
|
def test_fbp_format():
    """`_fbp` has the shape fb.<idx>.<unix_ms>.<random_int>."""
    history = [{"name": "x.com", "cookie_profile": "ga_consent_clarity"}]
    fields = None
    # `_fbp` is emitted with 50% probability; scan seeds until one has it.
    for seed in range(20):
        jar = build_cookies(seed=seed, browsing_history=history, now=_FIXED_NOW)
        hits = [cookie for cookie in jar if cookie["name"] == "_fbp"]
        if hits:
            fields = hits[0]["value"].split(".")
            break
    if fields is None:
        raise AssertionError("never got _fbp across 20 seeds — distribution broken")
    assert fields[0] == "fb"
    assert fields[1].isdigit()
    assert fields[2].isdigit() and len(fields[2]) >= 13  # unix ms
    assert fields[3].isdigit()
|
||||||
Reference in New Issue
Block a user