# ------------------------------------------------------------
# Pactolio robots.txt  ·  https://pactolio.com
# Last-updated: 2026-05-25
# ------------------------------------------------------------
# Structure:
#   1. Traditional search crawlers   (Googlebot, Bingbot)
#   2. GEO / AI crawlers             (Google-Extended, GPTBot,
#                                     OAI-SearchBot, ClaudeBot,
#                                     PerplexityBot, GoogleOther,
#                                     Applebot-Extended, Amazonbot,
#                                     Meta-ExternalAgent, cohere-ai,
#                                     CCBot)
#   3. Default wildcard              (everything else)
#   4. Sitemap + llms.txt
#
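# Note on Gemini: Google does NOT publish a "GeminiBot" user agent.
# Gemini model training and grounding in Gemini Apps / Vertex AI are
# controlled by the `Google-Extended` token (stanza below). It is a
# control token only: fetching is done by Google's existing crawlers.
# AI Overviews are a Search feature and follow the `Googlebot` rules,
# not `Google-Extended`. Adding a GeminiBot stanza would be a no-op.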
# Note on Anthropic: `anthropic-ai` is a legacy UA superseded by
# `ClaudeBot` (already explicitly permitted). No separate stanza.
#
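# Crawl-budget guard (every stanza, including the wildcard):
#   ?quarter=  — client-side state pushed by history.pushState
#               on pSEO detail pages; not distinct content. KEEP.
#               NOTE(review): `/*?quarter=` only matches when `quarter`
#               is the FIRST query parameter; `?x=1&quarter=…` is not
#               covered. Add `/*&quarter=` if such URLs can occur.
#   /api/*     — JSON endpoints; not in sitemap, not indexed. KEEP.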
#
# History (2026-05-25): The ?tab=, ?sort=, ?filter=, ?page= Disallow
# rules were dropped because (a) those URLs had near-zero impressions
# pre-incident, (b) they were a contributing concern in the May-17
# traffic-drop diagnostic, and (c) any future UI filter that emits
# such query strings will inject a canonical/noindex on the variant
# anyway. The root cause of the May-18 impression collapse was meta-
# description spinning in seo-phrasing.ts, NOT these Disallow rules —
# see Explanations/post-mortems/2026-05-25-impression-drop.md. The
# revert is a cleanliness fix, not a recovery driver.
# ------------------------------------------------------------


# ── 1. Traditional search crawlers: Googlebot ────────────────
# Disallow rules are listed first so order-sensitive (first-match)
# parsers and audit tools do not stop at `Allow: /`; longest-match
# parsers (RFC 9309, incl. Googlebot itself) ignore rule order.
User-agent: Googlebot
Disallow: /api/
Disallow: /*?quarter=
Allow: /

# ── 1. Traditional search crawlers: Bingbot ──────────────────
# Disallow rules are listed first so order-sensitive (first-match)
# parsers and audit tools do not stop at `Allow: /`; longest-match
# parsers (RFC 9309) ignore rule order.
User-agent: Bingbot
Disallow: /api/
Disallow: /*?quarter=
Allow: /


# ── GEO / AI crawlers (section 2 of the header outline) ──────
# Explicitly permitted on the full public surface.
# The same crawl-budget guards apply — no need to waste token budget
# on hundreds of ?quarter= variants of the same fund page.

# Google-Extended — control token for Gemini training / grounding use
# of content; it has no crawler of its own (Google's existing crawlers
# do the fetching). Disallow rules are listed first so first-match
# parsers do not stop at `Allow: /`.
User-agent: Google-Extended
Disallow: /api/
Disallow: /*?quarter=
Allow: /

# GPTBot — OpenAI crawler. Disallow rules are listed first so
# first-match parsers do not stop at `Allow: /`; longest-match
# parsers (RFC 9309) ignore rule order.
User-agent: GPTBot
Disallow: /api/
Disallow: /*?quarter=
Allow: /

# OAI-SearchBot — OpenAI search crawler. Disallow rules are listed
# first so first-match parsers do not stop at `Allow: /`;
# longest-match parsers (RFC 9309) ignore rule order.
User-agent: OAI-SearchBot
Disallow: /api/
Disallow: /*?quarter=
Allow: /

# ClaudeBot — Anthropic crawler. Disallow rules are listed first so
# first-match parsers do not stop at `Allow: /`; longest-match
# parsers (RFC 9309) ignore rule order.
User-agent: ClaudeBot
Disallow: /api/
Disallow: /*?quarter=
Allow: /

# PerplexityBot — Perplexity crawler. Disallow rules are listed first
# so first-match parsers do not stop at `Allow: /`; longest-match
# parsers (RFC 9309) ignore rule order.
User-agent: PerplexityBot
Disallow: /api/
Disallow: /*?quarter=
Allow: /

# GoogleOther — Google's generic crawler used by product teams
# (e.g. research and development crawls); distinct from Googlebot
# and Google-Extended.
# Disallow rules are listed first so first-match parsers do not stop
# at `Allow: /`; longest-match parsers (RFC 9309) ignore rule order.
User-agent: GoogleOther
Disallow: /api/
Disallow: /*?quarter=
Allow: /

# Applebot-Extended — Apple's control token for whether content
# crawled by Applebot may be used to train Apple's generative-AI
# models (Apple Intelligence); it does not crawl pages itself.
# Disallow rules are listed first so first-match parsers do not stop
# at `Allow: /`; longest-match parsers (RFC 9309) ignore rule order.
User-agent: Applebot-Extended
Disallow: /api/
Disallow: /*?quarter=
Allow: /

# Amazonbot — Alexa AI + Amazon search.
# Disallow rules are listed first so first-match parsers do not stop
# at `Allow: /`; longest-match parsers (RFC 9309) ignore rule order.
User-agent: Amazonbot
Disallow: /api/
Disallow: /*?quarter=
Allow: /

# Meta-ExternalAgent — Meta AI / Llama features.
# Disallow rules are listed first so first-match parsers do not stop
# at `Allow: /`; longest-match parsers (RFC 9309) ignore rule order.
User-agent: Meta-ExternalAgent
Disallow: /api/
Disallow: /*?quarter=
Allow: /

# cohere-ai — Cohere enterprise LLMs.
# Disallow rules are listed first so first-match parsers do not stop
# at `Allow: /`; longest-match parsers (RFC 9309) ignore rule order.
User-agent: cohere-ai
Disallow: /api/
Disallow: /*?quarter=
Allow: /

# CCBot — Common Crawl, source dataset for many open-source LLMs.
# Disallow rules are listed first so first-match parsers do not stop
# at `Allow: /`; longest-match parsers (RFC 9309) ignore rule order.
User-agent: CCBot
Disallow: /api/
Disallow: /*?quarter=
Allow: /


# ── 3. Default wildcard ──────────────────────────────────────
# All other crawlers: full access with the same parameter guards.
# This group is what generic and legacy parsers fall into, and some of
# them (e.g. Python's urllib.robotparser) apply the FIRST matching
# rule. With `Allow: /` on top they would treat /api/ as allowed, so
# the Disallow rules come first and the catch-all Allow goes last.
# Longest-match parsers (RFC 9309) are unaffected by the order.
User-agent: *
Disallow: /api/
Disallow: /*?quarter=
Allow: /


# ── 4. Sitemap + LLMs ────────────────────────────────────────
# Sitemap index location. The Sitemap record is not tied to any
# User-agent group, so it applies to every crawler reading this file.
Sitemap: https://pactolio.com/sitemap-index.xml

# Plain-language site manifest for AI crawlers (llms.txt spec).
# Kept as a comment on purpose: robots.txt defines no `llms.txt`
# field, so the line below is informational only.
# llms.txt: https://pactolio.com/llms.txt