# crawl-me-not 🚫🤖

A lightweight, framework-agnostic library to detect and block AI crawlers and SEO crawlers in any web server or framework.

## Features

- 🚫 **Block AI Crawlers**: Detect 43+ AI training bots like GPTBot, ChatGPT-User, Claude-Web, and more
- 🔍 **Optional SEO Blocking**: Also detect SEO crawlers when needed
- 🎯 **Framework Agnostic**: Works with Express, SvelteKit, Next.js, Fastify, vanilla Node.js, and more
- 🛠️ **Highly Configurable**: Custom patterns, whitelists, response messages, and headers
- 📝 **TypeScript**: Full TypeScript support with detailed type definitions
- 🪶 **Zero Dependencies**: Lightweight with no external dependencies
- 🧪 **Well Tested**: Comprehensive test coverage

## Installation

```bash
npm install crawl-me-not
```

## Quick Start

```typescript
import { shouldBlockCrawler, extractUserAgent } from 'crawl-me-not';

// Basic usage
const userAgent = extractUserAgent(request.headers);
const result = shouldBlockCrawler(userAgent);

if (result.isBlocked) {
  // Send 403 response
  return new Response('Access denied', { status: 403 });
}

// Continue with normal request handling
```

## Configuration Options

```typescript
interface CrawlerConfig {
  blockAI?: boolean;                    // Block AI crawlers (default: true)
  blockSEO?: boolean;                   // Block SEO crawlers (default: false)
  message?: string;                     // Custom response message (default: "Access denied")
  statusCode?: number;                  // HTTP status code (default: 403)
  customBlocked?: (string | RegExp)[];  // Additional patterns to block
  whitelist?: (string | RegExp)[];      // Patterns to always allow
  headers?: Record<string, string>;     // Custom response headers
  debug?: boolean;                      // Enable debug logging (default: false)
}
```

## Framework Examples

### Express

```typescript
import express from 'express';
import { shouldBlockCrawler, extractUserAgent } from 'crawl-me-not';

const app = express();

app.use((req, res, next) => {
  const userAgent = extractUserAgent(req.headers);
  const result = shouldBlockCrawler(userAgent, {
    blockAI: true,
    blockSEO: false,
    debug: true
  });

  if (result.isBlocked) {
    return res.status(403).json({
      error: 'Access denied',
      reason: `${result.crawlerType} crawler detected`,
      userAgent: result.userAgent
    });
  }

  next();
});
```

### SvelteKit

```typescript
// src/hooks.server.ts
import { shouldBlockCrawler, extractUserAgent } from 'crawl-me-not';
import type { Handle } from '@sveltejs/kit';

export const handle: Handle = async ({ event, resolve }) => {
  const userAgent = extractUserAgent(event.request.headers);
  const result = shouldBlockCrawler(userAgent);

  if (result.isBlocked) {
    return new Response('Access denied', {
      status: 403,
      headers: { 'X-Blocked-Reason': 'AI crawler detected' }
    });
  }

  return resolve(event);
};
```

### Next.js (App Router)

```typescript
// middleware.ts
import { shouldBlockCrawler, extractUserAgent } from 'crawl-me-not';
import { NextRequest, NextResponse } from 'next/server';

export function middleware(request: NextRequest) {
  const userAgent = extractUserAgent(request.headers);
  const result = shouldBlockCrawler(userAgent);

  if (result.isBlocked) {
    return NextResponse.json(
      { error: 'Access denied' },
      { status: 403 }
    );
  }

  return NextResponse.next();
}
```

### Next.js (Pages Router)

```typescript
// pages/api/[...all].ts
import { shouldBlockCrawler, extractUserAgent } from 'crawl-me-not';
import type { NextApiRequest, NextApiResponse } from 'next';

export default function handler(req: NextApiRequest, res: NextApiResponse) {
  const userAgent = extractUserAgent(req.headers);
  const result = shouldBlockCrawler(userAgent);

  if (result.isBlocked) {
    return res.status(403).json({ error: 'Access denied' });
  }

  // Continue with your API logic
  res.status(200).json({ message: 'Hello World' });
}
```

### Fastify

```typescript
import Fastify from 'fastify';
import { shouldBlockCrawler, extractUserAgent } from 'crawl-me-not';

const fastify = Fastify();

fastify.addHook('preHandler', async (request, reply) => {
  const userAgent = extractUserAgent(request.headers);
  const result = shouldBlockCrawler(userAgent);

  if (result.isBlocked) {
    reply.status(403).send({ error: 'Access denied' });
    return;
  }
});
```

### Vanilla Node.js

```typescript
import http from 'node:http';
import { shouldBlockCrawler, extractUserAgent } from 'crawl-me-not';

const server = http.createServer((req, res) => {
  const userAgent = extractUserAgent(req.headers);
  const result = shouldBlockCrawler(userAgent);

  if (result.isBlocked) {
    res.statusCode = 403;
    res.setHeader('Content-Type', 'application/json');
    res.end(JSON.stringify({ error: 'Access denied' }));
    return;
  }

  // Your normal request handling
  res.statusCode = 200;
  res.end('Hello World!');
});
```

### Bun

```typescript
import { shouldBlockCrawler, extractUserAgent } from 'crawl-me-not';

Bun.serve({
  fetch(request) {
    const userAgent = extractUserAgent(request.headers);
    const result = shouldBlockCrawler(userAgent);

    if (result.isBlocked) {
      return new Response('Access denied', { status: 403 });
    }

    return new Response('Hello World!');
  },
});
```

## Advanced Usage

### Custom Configuration

```typescript
const result = shouldBlockCrawler(userAgent, {
  blockAI: true,
  blockSEO: false,
  customBlocked: [
    /badbot/i,           // Block anything with "badbot"
    'unwanted-crawler',  // Block exact string match
    /scraper.*v[0-9]/i   // Block scraper versions
  ],
  whitelist: [
    /goodbot/i,           // Always allow "goodbot"
    'monitoring-service'  // Always allow this service
  ],
  message: 'Custom blocking message',
  statusCode: 429,
  headers: {
    'X-Blocked-Reason': 'Automated traffic detected',
    'Retry-After': '3600'
  },
  debug: true
});
```

### Manual Detection (Non-blocking)

```typescript
import { shouldBlockCrawler, extractUserAgent } from 'crawl-me-not';

// Just detect, don't block
const userAgent = extractUserAgent(request.headers);
const result = shouldBlockCrawler(userAgent, { blockAI: false, blockSEO: false });

// Log crawler activity
if (result.crawlerType) {
  console.log(`Detected ${result.crawlerType} crawler:`, result.userAgent);
}

// Apply custom logic (isRateLimited and blockResponse are your own helpers)
if (result.crawlerType === 'ai' && isRateLimited(request)) {
  return blockResponse();
}
```

### Rate Limiting for Crawlers

```typescript
// Assumes the Express setup and imports from the example above
const crawlerLimits = new Map();

app.use((req, res, next) => {
  const userAgent = extractUserAgent(req.headers);
  const result = shouldBlockCrawler(userAgent, { blockAI: false });

  if (result.crawlerType === 'ai') {
    const ip = req.ip;
    const now = Date.now();
    const limit = crawlerLimits.get(ip) || { count: 0, resetTime: now + 60000 };

    if (now > limit.resetTime) {
      limit.count = 0;
      limit.resetTime = now + 60000;
    }

    limit.count++;
    crawlerLimits.set(ip, limit);

    if (limit.count > 10) {
      return res.status(429).json({ error: 'Rate limit exceeded' });
    }
  }

  next();
});
```

## Known Crawlers

### AI Crawlers (Detected by default)

- **OpenAI**: GPTBot, ChatGPT-User
- **Google AI**: Google-Extended, GoogleOther
- **Anthropic**: Claude-Web, ClaudeBot
- **Meta/Facebook**: FacebookBot, Meta-ExternalAgent
- **ByteDance**: Bytespider, ByteDance
- **Others**: CCBot, PerplexityBot, YouBot, AI2Bot, cohere-ai
- **Generic patterns**: python-requests, curl, wget, scrapy, etc.

### SEO Crawlers (Detected but allowed by default)

- **Search Engines**: Googlebot, Bingbot, YandexBot, Baiduspider, DuckDuckBot
- **SEO Tools**: AhrefsBot, SemrushBot, MJ12bot, DotBot
- **Social Media**: facebookexternalhit, Twitterbot, LinkedInBot, WhatsApp

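
SEO crawlers are reported but allowed unless `blockSEO` is enabled. A minimal sketch of the difference, assuming the built-in SEO pattern list matches the illustrative Googlebot string below:

```typescript
import { shouldBlockCrawler } from 'crawl-me-not';

// Illustrative search-engine user agent (not a canonical or exhaustive string)
const googlebot = 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)';

// Detected but allowed with the defaults (blockSEO: false)
console.log(shouldBlockCrawler(googlebot).isBlocked);                     // expected: false

// Opting in to SEO blocking flips the result
console.log(shouldBlockCrawler(googlebot, { blockSEO: true }).isBlocked); // expected: true
```
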
## API Reference

### `shouldBlockCrawler(userAgent: string, config?: CrawlerConfig): CrawlerDetectionResult`

Main function to check if a user agent should be blocked.

**Returns:**
```typescript
interface CrawlerDetectionResult {
  isBlocked: boolean;                           // Whether the crawler should be blocked
  crawlerType: 'ai' | 'seo' | 'custom' | null;  // Type of crawler detected
  userAgent: string;                            // The original user agent string
  matchedPattern?: string | RegExp;             // Pattern that matched (if blocked)
}
```
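
A short sketch of consuming the result, using only the fields defined above (`GPTBot/1.0` is an illustrative user agent that the default AI list should match):

```typescript
import { shouldBlockCrawler } from 'crawl-me-not';

const result = shouldBlockCrawler('GPTBot/1.0');

if (result.isBlocked) {
  // matchedPattern is only present when a block pattern matched
  console.log(`Blocked ${result.crawlerType} crawler:`, result.userAgent, result.matchedPattern);
}
```
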
### `extractUserAgent(headers: HeadersLike): string`

Utility function to extract the user agent from various header formats.

**Supports:**

- Express-style headers: `{ 'user-agent': 'string' }`
- Web API Headers: `headers.get('user-agent')`
- Node.js IncomingMessage: `req.headers['user-agent']`

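
A minimal sketch of the two most common call sites, assuming both the plain-object and Web API `Headers` forms listed above are accepted:

```typescript
import { extractUserAgent } from 'crawl-me-not';

// Node.js / Express style: a plain object keyed by lowercase header names
const fromNode = extractUserAgent({ 'user-agent': 'GPTBot/1.0' });

// Fetch / Web API style: a Headers instance (SvelteKit, Next.js middleware, Bun, ...)
const fromFetch = extractUserAgent(new Headers({ 'User-Agent': 'GPTBot/1.0' }));

console.log(fromNode, fromFetch); // both should be 'GPTBot/1.0'
```
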
### `detectCrawlerType(userAgent: string): 'ai' | 'seo' | null`

Detects what type of crawler the user agent represents, without applying any blocking logic.
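
A quick sketch; the user agent strings are illustrative and the expected return values follow the pattern lists described under Known Crawlers:

```typescript
import { detectCrawlerType } from 'crawl-me-not';

console.log(detectCrawlerType('GPTBot/1.0'));    // expected: 'ai'
console.log(detectCrawlerType('Googlebot/2.1')); // expected: 'seo'
console.log(detectCrawlerType('Mozilla/5.0'));   // expected: null (no crawler pattern matched)
```
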
### Constants

- `AI_CRAWLER_PATTERNS`: Array of patterns for AI crawlers
- `SEO_CRAWLER_PATTERNS`: Array of patterns for SEO crawlers
- `DEFAULT_CONFIG`: Default configuration object (see the sketch below)

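
These exports can be handy for introspection or for building your own matching on top of the library. A small sketch, assuming the pattern arrays share the `(string | RegExp)[]` shape used by the config options; the manual matching below is this example's own logic, not the library's:

```typescript
import { AI_CRAWLER_PATTERNS, SEO_CRAWLER_PATTERNS, DEFAULT_CONFIG } from 'crawl-me-not';

// Inspect what ships with the library
console.log(`${AI_CRAWLER_PATTERNS.length} AI patterns, ${SEO_CRAWLER_PATTERNS.length} SEO patterns`);
console.log(DEFAULT_CONFIG);

// Manually test a user agent against the AI pattern list
const ua = 'GPTBot/1.0';
const matchesAI = AI_CRAWLER_PATTERNS.some((pattern) =>
  typeof pattern === 'string'
    ? ua.toLowerCase().includes(pattern.toLowerCase())
    : pattern.test(ua)
);
console.log(matchesAI); // expected: true
```
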
## Contributing

Contributions are welcome! Please feel free to submit issues and pull requests.

## License

MIT © [Your Name]

## Changelog

### 1.0.0

- Initial release
- Framework-agnostic design
- Comprehensive AI crawler detection
- Optional SEO crawler detection
- TypeScript support
- Zero dependencies