# Default group: applies to every crawler that is not matched by a more
# specific User-agent group elsewhere in this file.
# Each directive must sit on its own line; a parser reads everything after
# "User-agent:" up to the end of the line as the agent value.
User-agent: *
Disallow: /noindex/
Disallow: /misc/
Disallow: /~strawberry/
# Was a bare ".git"; a path has to start with "/" to be matched by strict parsers.
Disallow: /.git
# git.girlcock.ceo stuff
# from https://git.gay/gitgay/assets/src/branch/main/public/robots.txt
# (kept in the same group as the rules above: a second "User-agent: *" group
# is ignored by parsers that only honour the first matching group)
Disallow: /api/*
Disallow: /avatars
Disallow: /user/*
Disallow: /*/*/src/commit/*
Disallow: /*/*/commit/*
Disallow: /*/*/*/refs/*
Disallow: /*/*/*/star
Disallow: /*/*/*/watch
Disallow: /*/*/labels
Disallow: /*/*/activity/*
Disallow: /vendor/*
Disallow: /swagger.*.json
Disallow: /explore/*?*
Disallow: /repo/create
Disallow: /repo/migrate
Disallow: /org/create
Disallow: /*/*/fork
Disallow: /*/*/watchers
Disallow: /*/*/stargazers
Disallow: /*/*/forks
Disallow: /*/*/activity
Disallow: /*/*/projects
Disallow: /*/*/commits/
Disallow: /*/*/branches
Disallow: /*/*/tags
Disallow: /*/*/compare
Disallow: /*/*/lastcommit/*
Disallow: /*/*/issues/new
Disallow: /*/*/issues/?*
Disallow: /*/*/issues?*
Disallow: /*/*/pulls/?*
Disallow: /*/*/pulls?*
Disallow: /*/*/pulls/*/files
Disallow: /*/tree/
Disallow: /*/download
Disallow: /*/revisions
Disallow: /*/commits/*?author
Disallow: /*/commits/*?path
Disallow: /*/comments
Disallow: /*/blame/
Disallow: /*/raw/
Disallow: /*/cache/
Disallow: /.git/
# NOTE(review): the patterns below that start with "*" instead of "/" are kept
# verbatim from upstream; strict parsers may not accept them - confirm.
Disallow: */.git/
Disallow: /*.git
Disallow: /*.atom
Disallow: /*.rss
Disallow: /*/*/archive/
Disallow: *.bundle
Disallow: */commit/*.patch
Disallow: */commit/*.diff
# Query-string parameters
Disallow: /*lang=*
Disallow: /*source=*
Disallow: /*ref_cta=*
Disallow: /*plan=*
Disallow: /*return_to=*
Disallow: /*ref_loc=*
Disallow: /*setup_organization=*
Disallow: /*source_repo=*
Disallow: /*ref_page=*
Disallow: /*referrer=*
Disallow: /*report=*
Disallow: /*author=*
Disallow: /*since=*
Disallow: /*until=*
Disallow: /*commits?author=*
Disallow: /*tab=*
Disallow: /*q=*
Disallow: /*repo-search-archived=*
# Non-standard directive; crawlers that do not support it ignore it.
Crawl-delay: 2

# I opt out of online advertising so malware that injects ads on my site won't
# get paid. You should do the same.
# My ads.txt file contains a standard
# placeholder to forbid any compliant ad networks from paying for ad placement
# on my domain.
# Ad crawlers are blocked from everything except the two ads.txt files.
User-Agent: Adsbot
User-Agent: AdsBot-Google
User-Agent: AdsBot-Google-Mobile
Disallow: /
Allow: /ads.txt
Allow: /app-ads.txt

# Enabling our crawler to access your site offers several significant benefits
# to you as a publisher. By allowing us access, you enable the maximum number
# of advertisers to confidently purchase advertising space on your pages. Our
# comprehensive data insights help advertisers understand the suitability and
# context of your content, ensuring that their ads align with your audience's
# interests and needs. This alignment leads to improved user experiences,
# increased engagement, and ultimately, higher revenue potential for your
# publication. (https://www.peer39.com/crawler-notice)
# --> fuck off.
User-agent: peer39_crawler
User-Agent: peer39_crawler/1.0
Disallow: /

## IP-violation scanners ##

# The next three are borrowed from https://www.videolan.org/robots.txt

# > This robot collects content from the Internet for the sole purpose of
# helping educational institutions prevent plagiarism. [...] we compare student
# papers against the content we find on the Internet to see if we can find
# similarities. (http://www.turnitin.com/robot/crawlerinfo.html)
# --> fuck off.
User-Agent: TurnitinBot
Disallow: /

# > NameProtect engages in crawling activity in search of a wide range of brand
# and other intellectual property violations that may be of interest to our
# clients. (http://www.nameprotect.com/botinfo.html)
# --> fuck off.
User-Agent: NPBot
Disallow: /

# iThenticate is a new service we have developed to combat the piracy of
# intellectual property and ensure the originality of written work for
# publishers, non-profit agencies, corporations, and newspapers.
# (http://www.slysearch.com/)
# --> fuck off.
User-Agent: SlySearch
Disallow: /

# BLEXBot assists internet marketers to get information on the link structure
# of sites and their interlinking on the web, to avoid any technical and
# possible legal issues and improve overall online experience.
# (http://webmeup-crawler.com/)
# --> fuck off.
User-Agent: BLEXBot
Disallow: /

# Providing Intellectual Property professionals with superior brand protection
# services by artfully merging the latest technology with expert analysis.
# (https://www.checkmarknetwork.com/spider.html/)
# "The Internet is just way to big to effectively police alone." (ACTUAL quote)
# --> fuck off.
# The bare product token is listed as well, because parsers match on the
# product token rather than on a full User-Agent header string.
User-agent: CheckMarkNetwork
User-agent: CheckMarkNetwork/1.0 (+https://www.checkmarknetwork.com/spider.html)
Disallow: /

# Stop trademark violations and affiliate non-compliance in paid search.
# Automatically monitor your partner and affiliates’ online marketing to
# protect yourself from harmful brand violations and regulatory risks. We
# regularly crawl websites on behalf of our clients to ensure content
# compliance with brand and regulatory guidelines.
# (https://www.brandverity.com/why-is-brandverity-visiting-me)
# --> fuck off.
# Bare product token added for the same reason as above.
User-agent: BrandVerity
User-agent: BrandVerity/1.0
Disallow: /

## Misc. icky stuff ##

# Pipl assembles online identity information from multiple independent sources
# to create the most complete picture of a digital identity and connect it to
# real people and their offline identity records. When all the fragments of
# online identity data are collected, connected, and corroborated, the result
# is a more trustworthy identity.
# --> fuck off.
User-agent: PiplBot
Disallow: /

# Well-known overly-aggressive bot that claims to respect robots.txt: http://mj12bot.com/
# NOTE(review): a crawler that matches a named group follows only that group,
# so MJ12bot is not bound by the "User-agent: *" Disallow rules - confirm this
# is intended.
User-agent: MJ12bot
Crawl-Delay: 10

## Gen-AI data scrapers ##

# Eat shit, OpenAI.
User-agent: ChatGPT-User
User-agent: GPTBot
Disallow: /

# There isn't any public documentation for this AFAICT.
# Reuters thinks this works so I might as well give it a shot.
User-agent: anthropic-ai
User-agent: Claude-Web
Disallow: /

# Extremely aggressive crawling with no documentation. people had to email the
# company about this for robots.txt guidance.
User-agent: ClaudeBot
Disallow: /

# Official way to opt-out of Google's generative AI training:
# (reference link missing from this copy of the file)
User-agent: Google-Extended
Disallow: /

# FacebookBot crawls public web pages to improve language models for our speech
# recognition technology.
#
# UPDATE: The Meta-ExternalAgent crawler crawls the web for use cases such as
# training AI models or improving products by indexing content directly.
#
User-Agent: FacebookBot
User-Agent: meta-externalagent
Disallow: /

# This one doesn't support robots.txt: https://www.allenai.org/crawler
# block it with your reverse-proxy or WAF or something.

# See (reference link missing from this copy of the file)
# Parent page says it builds LLMs in the infographic:
User-agent: Cotoyogi
Disallow: /

# https://webz.io/bot.html
User-agent: Webzio-extended
Disallow: /

# Other AI/hostile shit
# One shared group: every agent listed here gets the single Disallow below.
User-agent: img2dataset
User-agent: Omgili
User-agent: Omgilibot
User-agent: Timpibot
User-agent: VelenPublicWebCrawler
User-agent: facebookexternalhit
User-agent: ICC-Crawler
User-agent: ImagesiftBot
User-agent: PetalBot
User-agent: Scrapy
User-agent: Bytespider
User-agent: Amazonbot
User-agent: Diffbot
User-agent: FriendlyCrawler
User-agent: OAI-SearchBot
User-agent: Applebot-Extended
Disallow: /