# Default rules for every crawler that is not matched by a more specific group
# further down.
#
# Everything lives in ONE "User-agent: *" group with no blank lines inside it.
# Parsers that follow the original 1994 convention treat a blank line as the
# end of a record and only honour the first "*" record they find, so splitting
# the rules up would silently drop most of them. Empty "#" lines are used as
# visual separators instead.
User-agent: *
# Site-wide private paths. "/.git" is a plain prefix so it also works for
# parsers without wildcard support.
Disallow: /noindex/
Disallow: /misc/
Disallow: /~strawberry/
Disallow: /.git
#
# git.girlcock.ceo stuff
# from https://git.gay/gitgay/assets/src/branch/main/public/robots.txt
Disallow: /api/*
Disallow: /avatars
Disallow: /user/*
Disallow: /*/*/src/commit/*
Disallow: /*/*/commit/*
Disallow: /*/*/*/refs/*
Disallow: /*/*/*/star
Disallow: /*/*/*/watch
Disallow: /*/*/labels
Disallow: /*/*/activity/*
Disallow: /vendor/*
Disallow: /swagger.*.json
#
Disallow: /explore/*?*
#
Disallow: /repo/create
Disallow: /repo/migrate
Disallow: /org/create
Disallow: /*/*/fork
#
Disallow: /*/*/watchers
Disallow: /*/*/stargazers
Disallow: /*/*/forks
#
Disallow: /*/*/activity
Disallow: /*/*/projects
Disallow: /*/*/commits/
Disallow: /*/*/branches
Disallow: /*/*/tags
Disallow: /*/*/compare
Disallow: /*/*/lastcommit/*
#
Disallow: /*/*/issues/new
Disallow: /*/*/issues/?*
Disallow: /*/*/issues?*
Disallow: /*/*/pulls/?*
Disallow: /*/*/pulls?*
Disallow: /*/*/pulls/*/files
#
Disallow: /*/tree/
Disallow: /*/download
Disallow: /*/revisions
Disallow: /*/commits/*?author
Disallow: /*/commits/*?path
Disallow: /*/comments
Disallow: /*/blame/
Disallow: /*/raw/
Disallow: /*/cache/
# Path patterns must begin with "/", so wildcard-led patterns are written as
# "/*..." rather than "*...".
Disallow: /.git/
Disallow: /*/.git/
Disallow: /*.git
Disallow: /*.atom
Disallow: /*.rss
#
Disallow: /*/*/archive/
Disallow: /*.bundle
Disallow: /*/commit/*.patch
Disallow: /*/commit/*.diff
#
# Query-string noise: any URL containing one of these parameters.
Disallow: /*lang=*
Disallow: /*source=*
Disallow: /*ref_cta=*
Disallow: /*plan=*
Disallow: /*return_to=*
Disallow: /*ref_loc=*
Disallow: /*setup_organization=*
Disallow: /*source_repo=*
Disallow: /*ref_page=*
Disallow: /*referrer=*
Disallow: /*report=*
Disallow: /*author=*
Disallow: /*since=*
Disallow: /*until=*
Disallow: /*commits?author=*
Disallow: /*tab=*
Disallow: /*q=*
Disallow: /*repo-search-archived=*
#
# Non-standard directive; only some crawlers honour it.
Crawl-delay: 2

# I opt out of online advertising so malware that injects ads on my site won't
# get paid. You should do the same. My ads.txt file contains a standard
# placeholder to forbid any compliant ad networks from paying for ad placement
# on my domain.
#
# The Allow lines come first on purpose: longest-match parsers do not care
# about order, but parsers that apply the first matching rule would otherwise
# hit "Disallow: /" and never be able to fetch the ads.txt files.
User-Agent: Adsbot
User-Agent: AdsBot-Google
User-Agent: AdsBot-Google-Mobile
Allow: /ads.txt
Allow: /app-ads.txt
Disallow: /

# The vendor's own pitch, quoted from their crawler notice:
# > Enabling our crawler to access your site offers several significant benefits
# to you as a publisher. By allowing us access, you enable the maximum number
# of advertisers to confidently purchase advertising space on your pages. Our
# comprehensive data insights help advertisers understand the suitability and
# context of your content, ensuring that their ads align with your audience's
# interests and needs. This alignment leads to improved user experiences,
# increased engagement, and ultimately, higher revenue potential for your
# publication. (https://www.peer39.com/crawler-notice)
#  --> fuck off.
# Both the bare token and the versioned form are listed; the whole site is
# denied to either.
User-agent: peer39_crawler
User-Agent: peer39_crawler/1.0
Disallow: /

## IP-violation scanners ##


# The next three are borrowed from https://www.videolan.org/robots.txt

# > This robot collects content from the Internet for the sole purpose of
# helping educational institutions prevent plagiarism. [...] we compare student
# papers against the content we find on the Internet to see if we can find
# similarities. (http://www.turnitin.com/robot/crawlerinfo.html)
#  --> fuck off.
User-Agent: TurnitinBot
Disallow: /

# > NameProtect engages in crawling activity in search of a wide range of brand
# and other intellectual property violations that may be of interest to our
# clients. (http://www.nameprotect.com/botinfo.html)
#  --> fuck off.
User-Agent: NPBot
Disallow: /

# > iThenticate is a new service we have developed to combat the piracy of
# intellectual property and ensure the originality of written work for
# publishers, non-profit agencies, corporations, and newspapers.
# (http://www.slysearch.com/)
#  --> fuck off.
User-Agent: SlySearch
Disallow: /

# > BLEXBot assists internet marketers to get information on the link structure
# of sites and their interlinking on the web, to avoid any technical and
# possible legal issues and improve overall online experience.
# (http://webmeup-crawler.com/)
# --> fuck off.
User-Agent: BLEXBot
Disallow: /

# > Providing Intellectual Property professionals with superior brand protection
# services by artfully merging the latest technology with expert analysis.
# (https://www.checkmarknetwork.com/spider.html/)
# "The Internet is just way to big to effectively police alone." (ACTUAL quote)
# --> fuck off.
#
# Crawlers match a group by their bare product token (letters, "_" and "-"
# only), so the token is listed on its own. The full user-agent string is kept
# as well for any parser that compares the whole value.
User-agent: CheckMarkNetwork
User-agent: CheckMarkNetwork/1.0 (+https://www.checkmarknetwork.com/spider.html)
Disallow: /

# > Stop trademark violations and affiliate non-compliance in paid search.
# Automatically monitor your partner and affiliates’ online marketing to
# protect yourself from harmful brand violations and regulatory risks. We
# regularly crawl websites on behalf of our clients to ensure content
# compliance with brand and regulatory guidelines.
# (https://www.brandverity.com/why-is-brandverity-visiting-me)
# --> fuck off.
#
# The bare product token is listed first because a version suffix is not part
# of a valid product token; the versioned form is kept for parsers that compare
# the whole value.
User-agent: BrandVerity
User-agent: BrandVerity/1.0
Disallow: /

## Misc. icky stuff ##


# > Pipl assembles online identity information from multiple independent sources
# to create the most complete picture of a digital identity and connect it to
# real people and their offline identity records. When all the fragments of
# online identity data are collected, connected, and corroborated, the result
# is a more trustworthy identity.
# --> fuck off.
User-agent: PiplBot
Disallow: /

# Well-known overly-aggressive bot that claims to respect robots.txt: http://mj12bot.com/
#
# A crawler that matches a named group uses ONLY that group and ignores the
# "User-agent: *" rules. With nothing but a Crawl-Delay here, MJ12bot would be
# allowed to fetch every path on the site, so the site-wide private paths are
# repeated. The git-forge rules from the "*" group are not repeated; copy them
# here too if MJ12bot should be kept out of those URLs.
User-agent: MJ12bot
Crawl-Delay: 10
Disallow: /noindex/
Disallow: /misc/
Disallow: /~strawberry/
Disallow: /.git

## Gen-AI data scrapers ##

# Eat shit, OpenAI.
User-agent: ChatGPT-User
User-agent: GPTBot
Disallow: /

# There isn't any public documentation for this AFAICT.
# Reuters thinks this works so I might as well give it a shot.
User-agent: anthropic-ai
User-agent: Claude-Web
Disallow: /

# Extremely aggressive crawling with no documentation. People had to email the
# company about this for robots.txt guidance.
User-agent: ClaudeBot
Disallow: /

# Official way to opt out of Google's generative AI training:
# <https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers>
User-agent: Google-Extended
Disallow: /

# > FacebookBot crawls public web pages to improve language models for our speech
# recognition technology.
# <https://developers.facebook.com/docs/sharing/bot/?_fb_noscript=1>
# UPDATE:
# > The Meta-ExternalAgent crawler crawls the web for use cases such as
# training AI models or improving products by indexing content directly.
# <https://developers.facebook.com/docs/sharing/webmasters/web-crawlers>
User-Agent: FacebookBot
User-Agent: meta-externalagent
Disallow: /

# This one doesn't support robots.txt: https://www.allenai.org/crawler
# block it with your reverse-proxy or WAF or something.

# See <https://ds.rois.ac.jp/center8/crawler/>
# Parent page says it builds LLMs in the infographic: <https://ds.rois.ac.jp/center8/>
# (No trailing whitespace after the token: a parser that does not trim the
# value would otherwise look for "Cotoyogi " and never match.)
User-agent: Cotoyogi
Disallow: /

# Crawler documentation: https://webz.io/bot.html
User-agent: Webzio-extended
Disallow: /

# Other AI/hostile shit
# This is a single group: every user-agent listed below shares the one
# "Disallow: /" at the end, so do not insert blank lines or rules between them.
User-agent: img2dataset
User-agent: Omgili
User-agent: Omgilibot
User-agent: Timpibot
User-agent: VelenPublicWebCrawler
User-agent: facebookexternalhit
User-agent: ICC-Crawler
User-agent: ImagesiftBot
User-agent: PetalBot
User-agent: Scrapy
User-agent: Bytespider
User-agent: Amazonbot
User-agent: Diffbot
User-agent: FriendlyCrawler
User-agent: OAI-SearchBot
User-agent: Applebot-Extended
Disallow: /