# crawl-me-not 🚫🤖

A lightweight, framework-agnostic library to detect and block AI crawlers, and optionally SEO crawlers, from any web server or framework.

## Features

- 🚫 Block AI Crawlers: Detect 43+ AI training bots like GPTBot, ChatGPT-User, Claude-Web, and more
- 🔍 Optional SEO Blocking: Also detect SEO crawlers when needed
- 🎯 Framework Agnostic: Works with Express, SvelteKit, Next.js, Fastify, vanilla Node.js, and more
- 🛠️ Highly Configurable: Custom patterns, whitelists, response messages, and headers
- 📝 TypeScript: Full TypeScript support with detailed type definitions
- 🪶 Zero Dependencies: Lightweight with no external dependencies
- 🧪 Well Tested: Comprehensive test coverage

## Installation

```bash
npm install crawl-me-not
```

## Quick Start

```typescript
import { shouldBlockCrawler, extractUserAgent } from 'crawl-me-not';

// Basic usage inside a request handler (a Fetch API `request` is assumed in scope)
const userAgent = extractUserAgent(request.headers);
const result = shouldBlockCrawler(userAgent);

if (result.isBlocked) {
  // Send a 403 response
  return new Response('Access denied', { status: 403 });
}

// Continue with normal request handling
```

## Configuration Options

```typescript
interface CrawlerConfig {
  blockAI?: boolean;                   // Block AI crawlers (default: true)
  blockSEO?: boolean;                  // Block SEO crawlers (default: false)
  message?: string;                    // Custom response message (default: "Access denied")
  statusCode?: number;                 // HTTP status code (default: 403)
  customBlocked?: (string | RegExp)[]; // Additional patterns to block
  whitelist?: (string | RegExp)[];     // Patterns to always allow
  headers?: Record<string, string>;    // Custom response headers
  debug?: boolean;                     // Enable debug logging (default: false)
}
```

## Framework Examples

### Express

```typescript
import express from 'express';
import { shouldBlockCrawler, extractUserAgent } from 'crawl-me-not';

const app = express();

app.use((req, res, next) => {
  const userAgent = extractUserAgent(req.headers);
  const result = shouldBlockCrawler(userAgent, {
    blockAI: true,
    blockSEO: false,
    debug: true
  });

  if (result.isBlocked) {
    return res.status(403).json({
      error: 'Access denied',
      reason: `${result.crawlerType} crawler detected`,
      userAgent: result.userAgent
    });
  }

  next();
});
```

### SvelteKit

```typescript
// src/hooks.server.ts
import { shouldBlockCrawler, extractUserAgent } from 'crawl-me-not';
import type { Handle } from '@sveltejs/kit';

export const handle: Handle = async ({ event, resolve }) => {
  const userAgent = extractUserAgent(event.request.headers);
  const result = shouldBlockCrawler(userAgent);

  if (result.isBlocked) {
    return new Response('Access denied', {
      status: 403,
      headers: { 'X-Blocked-Reason': 'AI crawler detected' }
    });
  }

  return resolve(event);
};
```

### Next.js (App Router)

```typescript
// middleware.ts
import { shouldBlockCrawler, extractUserAgent } from 'crawl-me-not';
import { NextRequest, NextResponse } from 'next/server';

export function middleware(request: NextRequest) {
  const userAgent = extractUserAgent(request.headers);
  const result = shouldBlockCrawler(userAgent);

  if (result.isBlocked) {
    return NextResponse.json(
      { error: 'Access denied' },
      { status: 403 }
    );
  }

  return NextResponse.next();
}
```

### Next.js (Pages Router)

```typescript
// pages/api/[...all].ts
import { shouldBlockCrawler, extractUserAgent } from 'crawl-me-not';
import type { NextApiRequest, NextApiResponse } from 'next';

export default function handler(req: NextApiRequest, res: NextApiResponse) {
  const userAgent = extractUserAgent(req.headers);
  const result = shouldBlockCrawler(userAgent);

  if (result.isBlocked) {
    return res.status(403).json({ error: 'Access denied' });
  }

  // Continue with your API logic
  res.status(200).json({ message: 'Hello World' });
}
```

### Fastify

```typescript
import Fastify from 'fastify';
import { shouldBlockCrawler, extractUserAgent } from 'crawl-me-not';

const fastify = Fastify();

fastify.addHook('preHandler', async (request, reply) => {
  const userAgent = extractUserAgent(request.headers);
  const result = shouldBlockCrawler(userAgent);

  if (result.isBlocked) {
    reply.status(403).send({ error: 'Access denied' });
    return;
  }
});
```

### Vanilla Node.js

```typescript
import http from 'node:http';
import { shouldBlockCrawler, extractUserAgent } from 'crawl-me-not';

const server = http.createServer((req, res) => {
  const userAgent = extractUserAgent(req.headers);
  const result = shouldBlockCrawler(userAgent);

  if (result.isBlocked) {
    res.statusCode = 403;
    res.setHeader('Content-Type', 'application/json');
    res.end(JSON.stringify({ error: 'Access denied' }));
    return;
  }

  // Your normal request handling
  res.statusCode = 200;
  res.end('Hello World!');
});

server.listen(3000);
```

### Bun

```typescript
import { shouldBlockCrawler, extractUserAgent } from 'crawl-me-not';

Bun.serve({
  fetch(request) {
    const userAgent = extractUserAgent(request.headers);
    const result = shouldBlockCrawler(userAgent);

    if (result.isBlocked) {
      return new Response('Access denied', { status: 403 });
    }

    return new Response('Hello World!');
  },
});
```

## Advanced Usage

### Custom Configuration

```typescript
const result = shouldBlockCrawler(userAgent, {
  blockAI: true,
  blockSEO: false,
  customBlocked: [
    /badbot/i,           // Block anything with "badbot"
    'unwanted-crawler',  // Block exact string match
    /scraper.*v[0-9]/i   // Block scraper versions
  ],
  whitelist: [
    /goodbot/i,          // Always allow "goodbot"
    'monitoring-service' // Always allow this service
  ],
  message: 'Custom blocking message',
  statusCode: 429,
  headers: {
    'X-Blocked-Reason': 'Automated traffic detected',
    'Retry-After': '3600'
  },
  debug: true
});
```

### Manual Detection (Non-blocking)

```typescript
import { shouldBlockCrawler, extractUserAgent } from 'crawl-me-not';

// Just detect, don't block
const userAgent = extractUserAgent(request.headers);
const result = shouldBlockCrawler(userAgent, { blockAI: false, blockSEO: false });

// Log crawler activity
if (result.crawlerType) {
  console.log(`Detected ${result.crawlerType} crawler:`, result.userAgent);
}

// Apply custom logic (isRateLimited and blockResponse are app-specific
// placeholders, not part of this library)
if (result.crawlerType === 'ai' && isRateLimited(request)) {
  return blockResponse();
}
```

### Rate Limiting for Crawlers

```typescript
// Express middleware: a naive in-memory limiter (10 requests per minute per IP)
const crawlerLimits = new Map();

app.use((req, res, next) => {
  const userAgent = extractUserAgent(req.headers);
  const result = shouldBlockCrawler(userAgent, { blockAI: false });

  if (result.crawlerType === 'ai') {
    const ip = req.ip;
    const now = Date.now();
    const limit = crawlerLimits.get(ip) || { count: 0, resetTime: now + 60000 };

    if (now > limit.resetTime) {
      limit.count = 0;
      limit.resetTime = now + 60000;
    }

    limit.count++;
    crawlerLimits.set(ip, limit);

    if (limit.count > 10) {
      return res.status(429).json({ error: 'Rate limit exceeded' });
    }
  }

  next();
});
```

## Known Crawlers

### AI Crawlers (Detected by default)

- OpenAI: GPTBot, ChatGPT-User
- Google AI: Google-Extended, GoogleOther
- Anthropic: Claude-Web, ClaudeBot
- Meta/Facebook: FacebookBot, Meta-ExternalAgent
- ByteDance: Bytespider, ByteDance
- Others: CCBot, PerplexityBot, YouBot, AI2Bot, cohere-ai
- Generic patterns: python-requests, curl, wget, scrapy, etc.

### SEO Crawlers (Detected but allowed by default)

- Search Engines: Googlebot, Bingbot, YandexBot, Baiduspider, DuckDuckBot
- SEO Tools: AhrefsBot, SemrushBot, MJ12bot, DotBot
- Social Media: facebookexternalhit, Twitterbot, LinkedInBot, WhatsApp
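
For example, to block SEO tools while still allowing a specific search engine, you can combine `blockSEO` with the `whitelist` option (whitelisted patterns are always allowed). A minimal sketch:

```typescript
import { shouldBlockCrawler } from 'crawl-me-not';

const result = shouldBlockCrawler(userAgent, {
  blockSEO: true,            // block SEO crawlers too
  whitelist: [/googlebot/i], // but always allow Googlebot
});
```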

## API Reference

### `shouldBlockCrawler(userAgent: string, config?: CrawlerConfig): CrawlerDetectionResult`

Main function to check if a user agent should be blocked.

Returns:

```typescript
interface CrawlerDetectionResult {
  isBlocked: boolean;                          // Whether the crawler should be blocked
  crawlerType: 'ai' | 'seo' | 'custom' | null; // Type of crawler detected
  userAgent: string;                           // The original user agent string
  matchedPattern?: string | RegExp;            // Pattern that matched (if blocked)
}
```
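
For example, the result can be inspected for logging before you decide how to respond (`userAgent` here is assumed to come from `extractUserAgent`):

```typescript
const result = shouldBlockCrawler(userAgent, { blockAI: true });

if (result.isBlocked) {
  // matchedPattern is only present when a block pattern actually matched
  console.warn(
    `Blocked ${result.crawlerType} crawler "${result.userAgent}" ` +
    `(matched: ${String(result.matchedPattern)})`
  );
}
```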

### `extractUserAgent(headers: HeadersLike): string`

Utility function to extract the user agent from various header formats.

Supports:

- Express-style header objects: `{ 'user-agent': '...' }`
- Web API `Headers` instances: `headers.get('user-agent')`
- Node.js `IncomingMessage` headers: `req.headers['user-agent']`
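
A quick sketch of the same extraction across two of these header shapes (the user-agent value is illustrative):

```typescript
import { extractUserAgent } from 'crawl-me-not';

// Plain header object (Express / Node.js style)
const fromObject = extractUserAgent({ 'user-agent': 'GPTBot/1.0' });

// Web API Headers instance (fetch / SvelteKit / Next.js style)
const fromHeaders = extractUserAgent(new Headers({ 'user-agent': 'GPTBot/1.0' }));

console.log(fromObject === fromHeaders); // true — both yield "GPTBot/1.0"
```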

### `detectCrawlerType(userAgent: string): 'ai' | 'seo' | null`

Detects the type of crawler the user agent represents, without any blocking logic.
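
For example, to label traffic in logs without blocking anything (the user-agent string below is illustrative):

```typescript
import { detectCrawlerType } from 'crawl-me-not';

const type = detectCrawlerType('Mozilla/5.0 (compatible; GPTBot/1.0)');

if (type === 'ai') {
  console.log('AI crawler');
} else if (type === 'seo') {
  console.log('SEO crawler');
} else {
  console.log('not a known crawler');
}
```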

### Constants

- `AI_CRAWLER_PATTERNS`: array of patterns for AI crawlers
- `SEO_CRAWLER_PATTERNS`: array of patterns for SEO crawlers
- `DEFAULT_CONFIG`: the default configuration object
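
A sketch of inspecting these exports, assuming each pattern array mixes strings and regular expressions like the `(string | RegExp)[]` fields in `CrawlerConfig`:

```typescript
import { AI_CRAWLER_PATTERNS, SEO_CRAWLER_PATTERNS, DEFAULT_CONFIG } from 'crawl-me-not';

// Render a pattern as a readable string for logging
const describe = (p: string | RegExp) => (p instanceof RegExp ? p.source : p);

console.log('AI patterns:', AI_CRAWLER_PATTERNS.map(describe));
console.log('SEO patterns:', SEO_CRAWLER_PATTERNS.map(describe));
console.log('defaults:', DEFAULT_CONFIG);
```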

## Contributing

Contributions are welcome! Please feel free to submit issues and pull requests.

## License

MIT © [Your Name]

## Changelog

### 1.0.0

- Initial release
- Framework-agnostic design
- Comprehensive AI crawler detection
- Optional SEO crawler detection
- TypeScript support
- Zero dependencies