From bff5e76ab59e4fa6d08527148c5f8951592a7ae4 Mon Sep 17 00:00:00 2001
From: Eric Scott
Date: Wed, 25 Sep 2024 10:53:29 -0700
Subject: [PATCH] update robots.txt

---
 robots.txt | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/robots.txt b/robots.txt
index d2c7d51..d7fc4b7 100644
--- a/robots.txt
+++ b/robots.txt
@@ -1,4 +1,6 @@
-# source: https://neil-clarke.com/block-the-bots-that-feed-ai-models-by-scraping-your-website/
+# sources:
+# https://www.cyberciti.biz/web-developer/block-openai-bard-bing-ai-crawler-bots-using-robots-txt-file/
+# https://neil-clarke.com/block-the-bots-that-feed-ai-models-by-scraping-your-website/
 
 # Data from Common Crawl is used to train ChatGPT, Bard, etc.
 User-agent: CCBot
@@ -27,12 +29,23 @@ Disallow: /
 User-agent: FacebookBot
 Disallow: /
 
+# Anthropic AI (Claude)
 User-agent: anthropic-ai
 Disallow: /
 
+User-agent: Claude-Web
+Disallow: /
+
+User-agent: ClaudeBot
+Disallow: /
+
 # ByteDance's bot for gathering LLM training data
 User-agent: Bytespider
 Disallow: /
 
 User-agent: ImagesiftBot
 Disallow: /
+
+# Takes content and re-writes it using genAI
+User-agent: PerplexityBot
+Disallow: /
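
A quick local check of the updated rules can be done with Python's standard-library urllib.robotparser. This is a minimal sketch, not part of the patch: the local file path "robots.txt" and the example URL "https://example.com/" are placeholders, and the listed agents are the crawlers added or annotated above.

```python
# Minimal sketch (not part of the patch): confirm the updated robots.txt
# blocks the crawlers added in this change. Assumes the new file is saved
# locally as ./robots.txt; the URL below is only a placeholder.
from urllib.robotparser import RobotFileParser

with open("robots.txt") as f:
    rules = RobotFileParser()
    rules.parse(f.read().splitlines())

# Each of these user agents has a "Disallow: /" rule, so can_fetch()
# should report that the whole site is off-limits to them.
for agent in ("anthropic-ai", "Claude-Web", "ClaudeBot", "PerplexityBot"):
    allowed = rules.can_fetch(agent, "https://example.com/")
    print(f"{agent}: {'allowed' if allowed else 'blocked'}")  # expect: blocked
```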