# WebWright robots.txt
# 2026 best practice: be explicit about every crawler we care about. The
# pre-2024 norm of "allow everything" no longer maps onto a web where
# generative-AI crawlers, search-index crawlers, and OAI-PMH-style data
# scrapers behave very differently. Sources:
#   - Google's robots.txt spec (RFC 9309 + extensions)
#   - OpenAI's published GPTBot policy: https://platform.openai.com/docs/gptbot
#   - Anthropic ClaudeBot: https://www.anthropic.com/claudebot
#   - Common Crawl: https://commoncrawl.org/big-picture/frequently-asked-questions

# -----------------------------------------------------------------------------
# Default policy: allow general web crawlers (Google, Bing, DuckDuckGo, etc.).
# Disallow auth and API surfaces that have no organic-search value and that
# we already mark noindex at the meta level.
# -----------------------------------------------------------------------------
# Fallback group for any crawler not matched by a more specific group.
User-agent: *
# Login surface. Rules are path-prefix matches, so this also covers /login/...
Disallow: /login
# API endpoints.
Disallow: /api/

# -----------------------------------------------------------------------------
# Generative-AI training crawlers. We currently permit them; flip Allow to
# Disallow for any vendor whose terms or attribution practices we object to.
# Being explicit avoids ambiguity from "User-agent: *" alone and creates a
# clear audit trail when policy changes.
# -----------------------------------------------------------------------------

# OpenAI: model-training crawler
# Disallow rules precede "Allow: /": first-match parsers (e.g. Python's
# urllib.robotparser) would otherwise stop at "Allow: /" and fetch /api/ and
# /login. Longest-match (RFC 9309) parsers are unaffected by the order.
User-agent: GPTBot
Disallow: /api/
Disallow: /login
Allow: /

# OpenAI: in-product browsing on behalf of ChatGPT users
# Disallow rules precede "Allow: /": first-match parsers (e.g. Python's
# urllib.robotparser) would otherwise stop at "Allow: /" and fetch /api/ and
# /login. Longest-match (RFC 9309) parsers are unaffected by the order.
User-agent: ChatGPT-User
Disallow: /api/
Disallow: /login
Allow: /

# OpenAI: ChatGPT search (formerly SearchGPT) index crawler
# Disallow rules precede "Allow: /": first-match parsers (e.g. Python's
# urllib.robotparser) would otherwise stop at "Allow: /" and fetch /api/ and
# /login. Longest-match (RFC 9309) parsers are unaffected by the order.
User-agent: OAI-SearchBot
Disallow: /api/
Disallow: /login
Allow: /

# Anthropic: ClaudeBot crawler
# Disallow rules precede "Allow: /": first-match parsers (e.g. Python's
# urllib.robotparser) would otherwise stop at "Allow: /" and fetch /api/ and
# /login. Longest-match (RFC 9309) parsers are unaffected by the order.
User-agent: ClaudeBot
Disallow: /api/
Disallow: /login
Allow: /

# Anthropic: "anthropic-ai" token.
# NOTE(review): believed to be a legacy token superseded by ClaudeBot; kept so
# older agents still get an explicit policy -- confirm before removing.
# Disallow rules precede "Allow: /": first-match parsers (e.g. Python's
# urllib.robotparser) would otherwise stop at "Allow: /" and fetch /api/ and
# /login. Longest-match (RFC 9309) parsers are unaffected by the order.
User-agent: anthropic-ai
Disallow: /api/
Disallow: /login
Allow: /

# Anthropic: "Claude-Web" token.
# NOTE(review): believed to be a legacy token superseded by ClaudeBot; kept so
# older agents still get an explicit policy -- confirm before removing.
# Disallow rules precede "Allow: /": first-match parsers (e.g. Python's
# urllib.robotparser) would otherwise stop at "Allow: /" and fetch /api/ and
# /login. Longest-match (RFC 9309) parsers are unaffected by the order.
User-agent: Claude-Web
Disallow: /api/
Disallow: /login
Allow: /

# Perplexity: index crawler
# Disallow rules precede "Allow: /": first-match parsers (e.g. Python's
# urllib.robotparser) would otherwise stop at "Allow: /" and fetch /api/ and
# /login. Longest-match (RFC 9309) parsers are unaffected by the order.
User-agent: PerplexityBot
Disallow: /api/
Disallow: /login
Allow: /

# Google-Extended: control token for use of our content in Google's AI models.
# It is not a separate crawler -- fetching is done by Google's existing user
# agents -- and it does not affect inclusion or ranking in Google Search.
# Disallow rules precede "Allow: /": first-match parsers (e.g. Python's
# urllib.robotparser) would otherwise stop at "Allow: /" and fetch /api/ and
# /login. Longest-match (RFC 9309) parsers are unaffected by the order.
User-agent: Google-Extended
Disallow: /api/
Disallow: /login
Allow: /

# ByteDance / TikTok
# Disallow rules precede "Allow: /": first-match parsers (e.g. Python's
# urllib.robotparser) would otherwise stop at "Allow: /" and fetch /api/ and
# /login. Longest-match (RFC 9309) parsers are unaffected by the order.
User-agent: Bytespider
Disallow: /api/
Disallow: /login
Allow: /

# Common Crawl (feeds many downstream datasets)
# Disallow rules precede "Allow: /": first-match parsers (e.g. Python's
# urllib.robotparser) would otherwise stop at "Allow: /" and fetch /api/ and
# /login. Longest-match (RFC 9309) parsers are unaffected by the order.
User-agent: CCBot
Disallow: /api/
Disallow: /login
Allow: /

# Meta AI
# Disallow rules precede "Allow: /": first-match parsers (e.g. Python's
# urllib.robotparser) would otherwise stop at "Allow: /" and fetch /api/ and
# /login. Longest-match (RFC 9309) parsers are unaffected by the order.
User-agent: Meta-ExternalAgent
Disallow: /api/
Disallow: /login
Allow: /

# Applebot-Extended: control token for use of our content in Apple's
# generative-AI models. It does not crawl pages itself; it governs how data
# already fetched by Applebot may be used.
# Disallow rules precede "Allow: /": first-match parsers (e.g. Python's
# urllib.robotparser) would otherwise stop at "Allow: /" and fetch /api/ and
# /login. Longest-match (RFC 9309) parsers are unaffected by the order.
User-agent: Applebot-Extended
Disallow: /api/
Disallow: /login
Allow: /

# Sitemap location (absolute URL). This directive is not part of any
# User-agent group and applies to every crawler that reads the file.
Sitemap: https://webwright.ai/sitemap.xml