From eb3a9d443c065e8519351689e063046f52755c0e Mon Sep 17 00:00:00 2001 From: John Bowdre Date: Thu, 13 Jun 2024 15:52:11 -0500 Subject: [PATCH] update post with apple ai bot name, formatting fixes --- content/posts/blocking-ai-crawlers/index.md | 25 +++++++++++++-------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/content/posts/blocking-ai-crawlers/index.md b/content/posts/blocking-ai-crawlers/index.md index 1e9beb9..99d687a 100644 --- a/content/posts/blocking-ai-crawlers/index.md +++ b/content/posts/blocking-ai-crawlers/index.md @@ -1,7 +1,7 @@ --- title: "Blocking AI Crawlers" date: 2024-04-12 -lastmod: "2024-04-14T02:21:57Z" +lastmod: "2024-06-13T20:51:54Z" description: "Using Hugo to politely ask AI bots to not steal my content - and then configuring Cloudflare's WAF to actively block them, just to be sure." featured: false toc: true @@ -24,7 +24,7 @@ robots = [ "AdsBot-Google", "Amazonbot", "anthropic-ai", - "Applebot", + "Applebot-Extended", "AwarioRssBot", "AwarioSmartBot", "Bytespider", @@ -47,9 +47,6 @@ robots = [ "PerplexityBot", "YouBot" ] - -[author] -name = "John Bowdre" ``` I then created a new template in `layouts/robots.txt`: @@ -57,9 +54,14 @@ I then created a new template in `layouts/robots.txt`: ```text Sitemap: {{ .Site.BaseURL }}/sitemap.xml +# hello robots [^_^] +# let's be friends <3 + User-agent: * Disallow: -{{ range .Site.Params.robots }} + +# except for these bots which are not friends: +{{ range .Site.Params.robots }} User-agent: {{ . 
}} {{- end }} Disallow: / @@ -74,15 +76,20 @@ enableRobotsTXT = true Now Hugo will generate the following `robots.txt` file for me: ```text -Sitemap: https://runtimeterror.dev//sitemap.xml +Sitemap: https://runtimeterror.dev/sitemap.xml + +# hello robots [^_^] +# let's be friends <3 User-agent: * Disallow: +# except for these bots which are not friends: + User-agent: AdsBot-Google User-agent: Amazonbot User-agent: anthropic-ai -User-agent: Applebot +User-agent: Applebot-Extended User-agent: AwarioRssBot User-agent: AwarioSmartBot User-agent: Bytespider @@ -129,7 +136,7 @@ So I added a [WAF Custom Rule](https://developers.cloudflare.com/waf/custom-rule Here's the expression I'm using: ```text -(http.user_agent contains "AdsBot-Google") or (http.user_agent contains "Amazonbot") or (http.user_agent contains "anthropic-ai") or (http.user_agent contains "Applebot") or (http.user_agent contains "AwarioRssBot") or (http.user_agent contains "AwarioSmartBot") or (http.user_agent contains "Bytespider") or (http.user_agent contains "CCBot") or (http.user_agent contains "ChatGPT-User") or (http.user_agent contains "ClaudeBot") or (http.user_agent contains "Claude-Web") or (http.user_agent contains "cohere-ai") or (http.user_agent contains "DataForSeoBot") or (http.user_agent contains "FacebookBot") or (http.user_agent contains "Google-Extended") or (http.user_agent contains "GoogleOther") or (http.user_agent contains "GPTBot") or (http.user_agent contains "ImagesiftBot") or (http.user_agent contains "magpie-crawler") or (http.user_agent contains "Meltwater") or (http.user_agent contains "omgili") or (http.user_agent contains "omgilibot") or (http.user_agent contains "peer39_crawler") or (http.user_agent contains "peer39_crawler/1.0") or (http.user_agent contains "PerplexityBot") or (http.user_agent contains "Seekr") or (http.user_agent contains "YouBot") +(http.user_agent contains "AdsBot-Google") or (http.user_agent contains "Amazonbot") or (http.user_agent contains 
"anthropic-ai") or (http.user_agent contains "Applebot-Extended") or (http.user_agent contains "AwarioRssBot") or (http.user_agent contains "AwarioSmartBot") or (http.user_agent contains "Bytespider") or (http.user_agent contains "CCBot") or (http.user_agent contains "ChatGPT-User") or (http.user_agent contains "ClaudeBot") or (http.user_agent contains "Claude-Web") or (http.user_agent contains "cohere-ai") or (http.user_agent contains "DataForSeoBot") or (http.user_agent contains "FacebookBot") or (http.user_agent contains "Google-Extended") or (http.user_agent contains "GoogleOther") or (http.user_agent contains "GPTBot") or (http.user_agent contains "ImagesiftBot") or (http.user_agent contains "magpie-crawler") or (http.user_agent contains "Meltwater") or (http.user_agent contains "omgili") or (http.user_agent contains "omgilibot") or (http.user_agent contains "peer39_crawler") or (http.user_agent contains "peer39_crawler/1.0") or (http.user_agent contains "PerplexityBot") or (http.user_agent contains "Seekr") or (http.user_agent contains "YouBot") ``` ![Creating a custom WAF rule in Cloudflare's web UI](cloudflare-waf-rule.png)