init commit

Silas 2025-05-31 12:58:55 -04:00
commit d13ea65209
Signed by: silentsilas
GPG Key ID: 4199EFB7DAA34349
18 changed files with 6436 additions and 0 deletions

23
.eslintrc.json Normal file

@@ -0,0 +1,23 @@
{
"env": {
"browser": true,
"es2021": true,
"node": true,
"jest": true
},
"extends": [
"eslint:recommended"
],
"parser": "@typescript-eslint/parser",
"parserOptions": {
"ecmaVersion": "latest",
"sourceType": "module"
},
"plugins": ["@typescript-eslint"],
"rules": {
"@typescript-eslint/no-unused-vars": "error",
"prefer-const": "error",
"no-var": "error"
},
"ignorePatterns": ["dist/", "node_modules/", "*.js", "examples/", "**/*.test.ts", "**/*.spec.ts"]
}

59
.gitignore vendored Normal file

@@ -0,0 +1,59 @@
# Dependencies
node_modules/
npm-debug.log*
yarn-debug.log*
yarn-error.log*
# Build output
dist/
build/
*.tsbuildinfo
# Coverage directory used by tools like istanbul
coverage/
*.lcov
# Runtime data
pids
*.pid
*.seed
*.pid.lock
# Logs
logs
*.log
# Environment variables
.env
.env.local
.env.development.local
.env.test.local
.env.production.local
# Editor directories and files
.vscode/
.idea/
*.swp
*.swo
*~
# OS generated files
.DS_Store
.DS_Store?
._*
.Spotlight-V100
.Trashes
ehthumbs.db
Thumbs.db
# Temporary folders
tmp/
temp/
# Testing
.nyc_output
# Package manager lock files (uncomment if you want to include them)
# package-lock.json
# yarn.lock
# pnpm-lock.yaml

1
.tool-versions Normal file

@@ -0,0 +1 @@
nodejs 22.16.0

21
LICENSE Normal file

@@ -0,0 +1,21 @@
MIT License
Copyright (c) 2024 crawl-me-not contributors
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

355
README.md Normal file

@@ -0,0 +1,355 @@
# crawl-me-not 🚫🤖
A lightweight, framework-agnostic library to detect and block AI crawlers and SEO crawlers from any web server or framework.
## Features
- 🚫 **Block AI Crawlers**: Detect 43+ AI training bots like GPTBot, ChatGPT-User, Claude-Web, and more
- 🔍 **Optional SEO Blocking**: Also detect SEO crawlers when needed
- 🎯 **Framework Agnostic**: Works with Express, SvelteKit, Next.js, Fastify, vanilla Node.js, and more
- 🛠️ **Highly Configurable**: Custom patterns, whitelists, response messages, and headers
- 📝 **TypeScript**: Full TypeScript support with detailed type definitions
- 🪶 **Zero Dependencies**: Lightweight with no external dependencies
- 🧪 **Well Tested**: Comprehensive test coverage
## Installation
```bash
npm install crawl-me-not
```
## Quick Start
```typescript
import { shouldBlockCrawler, extractUserAgent } from 'crawl-me-not';
// Basic usage
const userAgent = extractUserAgent(request.headers);
const result = shouldBlockCrawler(userAgent);
if (result.isBlocked) {
// Send 403 response
return new Response('Access denied', { status: 403 });
}
// Continue with normal request handling
```
## Configuration Options
```typescript
interface CrawlerConfig {
blockAI?: boolean; // Block AI crawlers (default: true)
blockSEO?: boolean; // Block SEO crawlers (default: false)
message?: string; // Custom response message (default: "Access denied")
statusCode?: number; // HTTP status code (default: 403)
customBlocked?: (string | RegExp)[]; // Additional patterns to block
whitelist?: (string | RegExp)[]; // Patterns to always allow
headers?: Record<string, string>; // Custom response headers
debug?: boolean; // Enable debug logging (default: false)
}
```
## Framework Examples
### Express
```typescript
import express from 'express';
import { shouldBlockCrawler, extractUserAgent } from 'crawl-me-not';
const app = express();
app.use((req, res, next) => {
const userAgent = extractUserAgent(req.headers);
const result = shouldBlockCrawler(userAgent, {
blockAI: true,
blockSEO: false,
debug: true
});
if (result.isBlocked) {
return res.status(403).json({
error: 'Access denied',
reason: `${result.crawlerType} crawler detected`,
userAgent: result.userAgent
});
}
next();
});
```
### SvelteKit
```typescript
// src/hooks.server.ts
import { shouldBlockCrawler, extractUserAgent } from 'crawl-me-not';
import type { Handle } from '@sveltejs/kit';
export const handle: Handle = async ({ event, resolve }) => {
const userAgent = extractUserAgent(event.request.headers);
const result = shouldBlockCrawler(userAgent);
if (result.isBlocked) {
return new Response('Access denied', {
status: 403,
headers: { 'X-Blocked-Reason': 'AI crawler detected' }
});
}
return resolve(event);
};
```
### Next.js (App Router)
```typescript
// middleware.ts
import { shouldBlockCrawler, extractUserAgent } from 'crawl-me-not';
import { NextRequest, NextResponse } from 'next/server';
export function middleware(request: NextRequest) {
const userAgent = extractUserAgent(request.headers);
const result = shouldBlockCrawler(userAgent);
if (result.isBlocked) {
return NextResponse.json(
{ error: 'Access denied' },
{ status: 403 }
);
}
return NextResponse.next();
}
```
### Next.js (Pages Router)
```typescript
// pages/api/[...all].ts
import { shouldBlockCrawler, extractUserAgent } from 'crawl-me-not';
import type { NextApiRequest, NextApiResponse } from 'next';
export default function handler(req: NextApiRequest, res: NextApiResponse) {
const userAgent = extractUserAgent(req.headers);
const result = shouldBlockCrawler(userAgent);
if (result.isBlocked) {
return res.status(403).json({ error: 'Access denied' });
}
// Continue with your API logic
res.status(200).json({ message: 'Hello World' });
}
```
### Fastify
```typescript
import Fastify from 'fastify';
import { shouldBlockCrawler, extractUserAgent } from 'crawl-me-not';
const fastify = Fastify();
fastify.addHook('preHandler', async (request, reply) => {
const userAgent = extractUserAgent(request.headers);
const result = shouldBlockCrawler(userAgent);
if (result.isBlocked) {
reply.status(403).send({ error: 'Access denied' });
return;
}
});
```
### Vanilla Node.js
```typescript
import http from 'node:http';
import { shouldBlockCrawler, extractUserAgent } from 'crawl-me-not';
const server = http.createServer((req, res) => {
const userAgent = extractUserAgent(req.headers);
const result = shouldBlockCrawler(userAgent);
if (result.isBlocked) {
res.statusCode = 403;
res.setHeader('Content-Type', 'application/json');
res.end(JSON.stringify({ error: 'Access denied' }));
return;
}
// Your normal request handling
res.statusCode = 200;
res.end('Hello World!');
});
```
### Bun
```typescript
import { shouldBlockCrawler, extractUserAgent } from 'crawl-me-not';
Bun.serve({
fetch(request) {
const userAgent = extractUserAgent(request.headers);
const result = shouldBlockCrawler(userAgent);
if (result.isBlocked) {
return new Response('Access denied', { status: 403 });
}
return new Response('Hello World!');
},
});
```
## Advanced Usage
### Custom Configuration
```typescript
const result = shouldBlockCrawler(userAgent, {
blockAI: true,
blockSEO: false,
customBlocked: [
/badbot/i, // Block anything with "badbot"
'unwanted-crawler', // Block exact string match
/scraper.*v[0-9]/i // Block scraper versions
],
whitelist: [
/goodbot/i, // Always allow "goodbot"
'monitoring-service' // Always allow this service
],
message: 'Custom blocking message',
statusCode: 429,
headers: {
'X-Blocked-Reason': 'Automated traffic detected',
'Retry-After': '3600'
},
debug: true
});
```
### Manual Detection (Non-blocking)
```typescript
import { shouldBlockCrawler, extractUserAgent } from 'crawl-me-not';
// Just detect, don't block
const userAgent = extractUserAgent(request.headers);
const result = shouldBlockCrawler(userAgent, { blockAI: false, blockSEO: false });
// Log crawler activity
if (result.crawlerType) {
console.log(`Detected ${result.crawlerType} crawler:`, result.userAgent);
}
// Apply custom logic
if (result.crawlerType === 'ai' && isRateLimited(request)) {
return blockResponse();
}
```
### Rate Limiting for Crawlers
```typescript
const crawlerLimits = new Map();
app.use((req, res, next) => {
const userAgent = extractUserAgent(req.headers);
const result = shouldBlockCrawler(userAgent, { blockAI: false });
if (result.crawlerType === 'ai') {
const ip = req.ip;
const now = Date.now();
const limit = crawlerLimits.get(ip) || { count: 0, resetTime: now + 60000 };
if (now > limit.resetTime) {
limit.count = 0;
limit.resetTime = now + 60000;
}
limit.count++;
crawlerLimits.set(ip, limit);
if (limit.count > 10) {
return res.status(429).json({ error: 'Rate limit exceeded' });
}
}
next();
});
```
## Known Crawlers
### AI Crawlers (Detected by default)
- **OpenAI**: GPTBot, ChatGPT-User
- **Google AI**: Google-Extended, GoogleOther
- **Anthropic**: Claude-Web, ClaudeBot
- **Meta/Facebook**: FacebookBot, Meta-ExternalAgent
- **ByteDance**: Bytespider, ByteDance
- **Others**: CCBot, PerplexityBot, YouBot, AI2Bot, cohere-ai
- **Generic patterns**: python-requests, curl, wget, scrapy, etc.
### SEO Crawlers (Detected but allowed by default)
- **Search Engines**: Googlebot, Bingbot, YandexBot, Baiduspider, DuckDuckBot
- **SEO Tools**: AhrefsBot, SemrushBot, MJ12bot, DotBot
- **Social Media**: facebookexternalhit, Twitterbot, LinkedInBot, WhatsApp
## API Reference
### `shouldBlockCrawler(userAgent: string, config?: CrawlerConfig): CrawlerDetectionResult`
Main function to check if a user agent should be blocked.
**Returns:**
```typescript
interface CrawlerDetectionResult {
isBlocked: boolean; // Whether the crawler should be blocked
crawlerType: 'ai' | 'seo' | 'custom' | null; // Type of crawler detected
userAgent: string; // The original user agent string
matchedPattern?: string | RegExp; // Pattern that matched (if blocked)
}
```
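For example, a blocked result carries the pattern that triggered the match (a small sketch; `/Bytespider/i` is one of the built-in AI patterns):
```typescript
import { shouldBlockCrawler } from 'crawl-me-not';

const result = shouldBlockCrawler('Bytespider/1.0');

if (result.isBlocked) {
  console.log(result.crawlerType);    // 'ai'
  console.log(result.matchedPattern); // /Bytespider/i
}
```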
### `extractUserAgent(headers: HeadersLike): string`
Utility function to extract the user agent from various header formats.
**Supports:**
- Express-style headers: `{ 'user-agent': 'string' }`
- Web API Headers: `headers.get('user-agent')`
- Node.js IncomingMessage: `req.headers['user-agent']`
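A quick sketch of each supported shape (return values mirror the unit tests in `src/core.test.ts`):
```typescript
import { extractUserAgent } from 'crawl-me-not';

// Plain header object (Express, Node.js IncomingMessage)
extractUserAgent({ 'user-agent': 'GPTBot/1.0' }); // 'GPTBot/1.0'

// Web API Headers instance (fetch, SvelteKit, Next.js)
extractUserAgent(new Headers({ 'user-agent': 'GPTBot/1.0' })); // 'GPTBot/1.0'

// A missing user agent falls back to an empty string
extractUserAgent({}); // ''
```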
### `detectCrawlerType(userAgent: string): 'ai' | 'seo' | null`
Detects the type of crawler a user agent represents, without applying any blocking logic.
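For example (mirroring the cases exercised in the test suite):
```typescript
import { detectCrawlerType } from 'crawl-me-not';

detectCrawlerType('GPTBot/1.0');    // 'ai'
detectCrawlerType('Googlebot/2.1'); // 'seo'
detectCrawlerType('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'); // null
```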
### Constants
- `AI_CRAWLER_PATTERNS`: Array of patterns for AI crawlers
- `SEO_CRAWLER_PATTERNS`: Array of patterns for SEO crawlers
- `DEFAULT_CONFIG`: Default configuration object
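These work directly with `matchesPatterns` (also exported), e.g. to build custom logic on top of the built-in lists; a brief sketch:
```typescript
import { AI_CRAWLER_PATTERNS, DEFAULT_CONFIG, matchesPatterns } from 'crawl-me-not';

// Test a user agent against the built-in AI pattern list
const { match, pattern } = matchesPatterns('Bytespider/1.0', AI_CRAWLER_PATTERNS);
console.log(match);   // true
console.log(pattern); // /Bytespider/i

// Defaults applied when no config is passed
console.log(DEFAULT_CONFIG.statusCode); // 403
```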
## Contributing
Contributions are welcome! Please feel free to submit issues and pull requests.
## License
MIT © crawl-me-not contributors
## Changelog
### 1.0.0
- Initial release
- Framework-agnostic design
- Comprehensive AI crawler detection
- Optional SEO crawler detection
- TypeScript support
- Zero dependencies

39
examples/express.js Normal file

@@ -0,0 +1,39 @@
const express = require('express');
const { shouldBlockCrawler, extractUserAgent } = require('crawl-me-not');
const app = express();
// Middleware to block AI crawlers
app.use((req, res, next) => {
const userAgent = extractUserAgent(req.headers);
const result = shouldBlockCrawler(userAgent, {
blockAI: true,
blockSEO: false,
debug: true
});
if (result.isBlocked) {
console.log(`Blocked ${result.crawlerType} crawler: ${result.userAgent}`);
return res.status(403).json({
error: 'Access denied',
reason: `${result.crawlerType} crawler detected`,
userAgent: result.userAgent
});
}
next();
});
app.get('/', (req, res) => {
res.json({ message: 'Hello World! AI crawlers are blocked.' });
});
app.get('/api/data', (req, res) => {
res.json({ data: 'This API is protected from AI crawlers' });
});
const port = process.env.PORT || 3000;
app.listen(port, () => {
console.log(`Server running on port ${port}`);
console.log('AI crawlers will receive a 403 response');
});


@@ -0,0 +1,46 @@
import { shouldBlockCrawler, extractUserAgent } from 'crawl-me-not';
import { NextRequest, NextResponse } from 'next/server';
export function middleware(request: NextRequest) {
const userAgent = extractUserAgent(request.headers);
const result = shouldBlockCrawler(userAgent, {
blockAI: true,
blockSEO: false,
customBlocked: [/scrapy/i, /curl/i], // Block additional patterns
debug: true
});
if (result.isBlocked) {
console.log(`Blocked ${result.crawlerType} crawler: ${result.userAgent}`);
return NextResponse.json(
{
error: 'Access denied',
reason: `${result.crawlerType} crawler detected`,
userAgent: result.userAgent
},
{
status: 403,
headers: {
'X-Blocked-Reason': 'Automated traffic detected'
}
}
);
}
return NextResponse.next();
}
// Configure which paths to run middleware on
export const config = {
matcher: [
/*
* Match all request paths except for the ones starting with:
* - api (API routes)
* - _next/static (static files)
* - _next/image (image optimization files)
* - favicon.ico (favicon file)
*/
'/((?!api|_next/static|_next/image|favicon.ico).*)',
],
};


@@ -0,0 +1,32 @@
import { shouldBlockCrawler, extractUserAgent } from 'crawl-me-not';
import type { Handle } from '@sveltejs/kit';
export const handle: Handle = async ({ event, resolve }) => {
const userAgent = extractUserAgent(event.request.headers);
const result = shouldBlockCrawler(userAgent, {
blockAI: true,
blockSEO: false,
debug: true
});
if (result.isBlocked) {
console.log(`Blocked ${result.crawlerType} crawler: ${result.userAgent}`);
return new Response('Access denied', {
status: 403,
headers: {
'Content-Type': 'application/json',
'X-Blocked-Reason': `${result.crawlerType} crawler detected`
}
});
}
return resolve(event);
};
// If you need to compose multiple handles:
// import { sequence } from '@sveltejs/kit/hooks';
//
// export const handle = sequence(
// crawlerBlockingHandle,
// // your other handles...
// );

39
examples/vanilla-node.js Normal file

@@ -0,0 +1,39 @@
const http = require('node:http');
const { shouldBlockCrawler, extractUserAgent } = require('crawl-me-not');
const server = http.createServer((req, res) => {
const userAgent = extractUserAgent(req.headers);
const result = shouldBlockCrawler(userAgent, {
blockAI: true,
blockSEO: false,
debug: true
});
if (result.isBlocked) {
console.log(`Blocked ${result.crawlerType} crawler: ${result.userAgent}`);
res.statusCode = 403;
res.setHeader('Content-Type', 'application/json');
res.setHeader('X-Blocked-Reason', 'AI crawler detected');
res.end(JSON.stringify({
error: 'Access denied',
reason: `${result.crawlerType} crawler detected`,
userAgent: result.userAgent
}));
return;
}
// Normal request handling
res.statusCode = 200;
res.setHeader('Content-Type', 'application/json');
res.end(JSON.stringify({
message: 'Hello World!',
timestamp: new Date().toISOString()
}));
});
const port = process.env.PORT || 3000;
server.listen(port, () => {
console.log(`Server running on port ${port}`);
console.log('AI crawlers will receive a 403 response');
});

21
jest.config.js Normal file

@@ -0,0 +1,21 @@
module.exports = {
preset: 'ts-jest',
testEnvironment: 'node',
roots: ['<rootDir>/src'],
testMatch: ['**/__tests__/**/*.ts', '**/?(*.)+(spec|test).ts'],
transform: {
'^.+\\.ts$': ['ts-jest', {
useESM: true,
}],
},
collectCoverageFrom: [
'src/**/*.ts',
'!src/**/*.d.ts',
],
coverageDirectory: 'coverage',
coverageReporters: ['text', 'lcov', 'html'],
moduleNameMapper: {
'^(\\.{1,2}/.*)\\.js$': '$1',
},
extensionsToTreatAsEsm: ['.ts'],
};

5310
package-lock.json generated Normal file

File diff suppressed because it is too large

57
package.json Normal file

@@ -0,0 +1,57 @@
{
"name": "@silentsilas/crawl-me-not",
"version": "1.0.0",
"description": "Detect and block AI crawlers and SEO crawlers from any web server or framework",
"main": "dist/index.js",
"types": "dist/index.d.ts",
"files": [
"dist",
"README.md",
"LICENSE"
],
"scripts": {
"build": "tsc",
"dev": "tsc --watch",
"lint": "eslint src/**/*.ts",
"lint:fix": "eslint src/**/*.ts --fix",
"test": "jest",
"prepublishOnly": "npm run build",
"clean": "rm -rf dist"
},
"keywords": [
"crawler",
"bot",
"ai",
"seo",
"middleware",
"robots",
"scraping",
"protection",
"user-agent",
"detection",
"framework-agnostic"
],
"author": "Silas",
"license": "MIT",
"repository": {
"type": "git",
"url": "git@git.silentsilas.com:silentsilas/crawl-me-not.git"
},
"bugs": {
"url": "https://git.silentsilas.com/silentsilas/crawl-me-not/issues"
},
"homepage": "https://git.silentsilas.com/silentsilas/crawl-me-not#readme",
"devDependencies": {
"@types/jest": "^29.5.8",
"@types/node": "^20.9.0",
"@typescript-eslint/eslint-plugin": "^6.12.0",
"@typescript-eslint/parser": "^6.12.0",
"eslint": "^8.54.0",
"jest": "^29.7.0",
"ts-jest": "^29.1.1",
"typescript": "^5.2.2"
},
"engines": {
"node": ">=16.0.0"
}
}

58
src/core.test.ts Normal file

@@ -0,0 +1,58 @@
import { shouldBlockCrawler, extractUserAgent } from './core.js';
describe('shouldBlockCrawler', () => {
test('should block AI crawlers by default', () => {
const result = shouldBlockCrawler('GPTBot/1.0');
expect(result.isBlocked).toBe(true);
expect(result.crawlerType).toBe('ai');
});
test('should not block SEO crawlers by default', () => {
const result = shouldBlockCrawler('Googlebot/2.1');
expect(result.isBlocked).toBe(false);
expect(result.crawlerType).toBe('seo');
});
test('should respect whitelist', () => {
const result = shouldBlockCrawler('GPTBot/1.0', {
whitelist: [/GPTBot/i]
});
expect(result.isBlocked).toBe(false);
});
test('should block custom patterns', () => {
const result = shouldBlockCrawler('CustomBot/1.0', {
customBlocked: [/CustomBot/i]
});
expect(result.isBlocked).toBe(true);
expect(result.crawlerType).toBe('custom');
});
test('should allow regular browsers', () => {
const result = shouldBlockCrawler('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36');
expect(result.isBlocked).toBe(false);
expect(result.crawlerType).toBe(null);
});
});
describe('extractUserAgent', () => {
test('should extract from Express-style headers', () => {
const headers = { 'user-agent': 'TestBot/1.0' };
const userAgent = extractUserAgent(headers);
expect(userAgent).toBe('TestBot/1.0');
});
test('should extract from SvelteKit-style headers', () => {
const headers = {
get: (name: string) => name === 'user-agent' ? 'TestBot/1.0' : null
};
const userAgent = extractUserAgent(headers);
expect(userAgent).toBe('TestBot/1.0');
});
test('should handle missing user agent', () => {
const headers = {};
const userAgent = extractUserAgent(headers);
expect(userAgent).toBe('');
});
});

130
src/core.ts Normal file

@@ -0,0 +1,130 @@
import { CrawlerConfig, CrawlerDetectionResult, HeadersLike } from './types.js';
import { AI_CRAWLER_PATTERNS, SEO_CRAWLER_PATTERNS, matchesPatterns, detectCrawlerType } from './crawlers.js';
/**
* Default configuration for crawler blocking
*/
export const DEFAULT_CONFIG: Required<CrawlerConfig> = {
blockAI: true,
blockSEO: false,
message: 'Access denied',
statusCode: 403,
customBlocked: [],
whitelist: [],
headers: {},
debug: false,
};
/**
* Merge user config with default config
*/
export function mergeConfig(userConfig: CrawlerConfig = {}): Required<CrawlerConfig> {
return {
...DEFAULT_CONFIG,
...userConfig,
customBlocked: [...DEFAULT_CONFIG.customBlocked, ...(userConfig.customBlocked || [])],
whitelist: [...DEFAULT_CONFIG.whitelist, ...(userConfig.whitelist || [])],
headers: { ...DEFAULT_CONFIG.headers, ...(userConfig.headers || {}) },
};
}
/**
* Log debug information if debug mode is enabled
*/
function debug(config: Required<CrawlerConfig>, message: string): void {
if (config.debug) {
// eslint-disable-next-line no-console
console.log(`[crawl-me-not] ${message}`);
}
}
/**
* Check if a user agent should be blocked based on the configuration
*/
export function shouldBlockCrawler(userAgent: string, config: CrawlerConfig = {}): CrawlerDetectionResult {
const mergedConfig = mergeConfig(config);
// Default result
const result: CrawlerDetectionResult = {
isBlocked: false,
crawlerType: null,
userAgent,
};
debug(mergedConfig, `Checking user agent: ${userAgent}`);
// Check whitelist first (takes precedence)
const whitelistMatch = matchesPatterns(userAgent, mergedConfig.whitelist);
if (whitelistMatch.match) {
debug(mergedConfig, `User agent whitelisted by pattern: ${String(whitelistMatch.pattern)}`);
return result;
}
// Check custom blocked patterns
const customMatch = matchesPatterns(userAgent, mergedConfig.customBlocked);
if (customMatch.match && customMatch.pattern) {
debug(mergedConfig, `User agent blocked by custom pattern: ${String(customMatch.pattern)}`);
return {
...result,
isBlocked: true,
crawlerType: 'custom',
matchedPattern: customMatch.pattern,
};
}
// Detect crawler type
const crawlerType = detectCrawlerType(userAgent);
if (crawlerType === 'ai' && mergedConfig.blockAI) {
const aiMatch = matchesPatterns(userAgent, AI_CRAWLER_PATTERNS);
if (aiMatch.pattern) {
debug(mergedConfig, `AI crawler blocked: ${String(aiMatch.pattern)}`);
return {
...result,
isBlocked: true,
crawlerType: 'ai',
matchedPattern: aiMatch.pattern,
};
}
}
if (crawlerType === 'seo' && mergedConfig.blockSEO) {
const seoMatch = matchesPatterns(userAgent, SEO_CRAWLER_PATTERNS);
if (seoMatch.pattern) {
debug(mergedConfig, `SEO crawler blocked: ${String(seoMatch.pattern)}`);
return {
...result,
isBlocked: true,
crawlerType: 'seo',
matchedPattern: seoMatch.pattern,
};
}
}
debug(mergedConfig, 'User agent allowed');
return {
...result,
crawlerType,
};
}
/**
* Extract user agent from various header formats
*/
export function extractUserAgent(headers: HeadersLike): string {
// Handle Headers-like object (Web API, SvelteKit, etc.)
if (typeof (headers as { get(name: string): string | null }).get === 'function') {
const headersObj = headers as { get(name: string): string | null };
return headersObj.get('user-agent') || headersObj.get('User-Agent') || '';
}
// Handle regular object (Express, Node.js, etc.)
const headersObj = headers as Record<string, string | string[] | undefined>;
const userAgent = headersObj['user-agent'] || headersObj['User-Agent'];
if (Array.isArray(userAgent)) {
return userAgent[0] || '';
}
return userAgent || '';
}

160
src/crawlers.ts Normal file

@@ -0,0 +1,160 @@
/**
* Known AI crawler user agent patterns
* These are patterns for bots that are primarily used for AI training, data collection, or content scraping
*/
export const AI_CRAWLER_PATTERNS: (string | RegExp)[] = [
// OpenAI
/GPTBot/i,
/ChatGPT-User/i,
// Google AI
/Google-Extended/i,
/GoogleOther/i,
// Anthropic
/Claude-Web/i,
/ClaudeBot/i,
// Meta/Facebook AI
/FacebookBot/i,
/Meta-ExternalAgent/i,
// Bytedance/TikTok
/Bytespider/i,
/ByteDance/i,
// Common AI/ML crawlers
/CCBot/i,
/anthropic-ai/i,
/PerplexityBot/i,
/YouBot/i,
/ChatGPT/i,
/GPT/i,
/OpenAI/i,
/AI2Bot/i,
/cohere-ai/i,
// Academic/Research crawlers often used for AI training
/ArchiveBot/i,
/Internet Archive/i,
/archive\.org/i,
// Content scrapers and data collectors
/DataForSeoBot/i,
/SemrushBot/i,
/AhrefsBot/i,
/MJ12bot/i,
/DotBot/i,
/CommonCrawl/i,
/webzio/i,
/Scrapy/i,
/python-requests/i,
/python-urllib/i,
/curl/i,
/wget/i,
/HTTPie/i,
/Postman/i,
/Insomnia/i,
// Generic AI/bot patterns
/bot.*ai/i,
/ai.*bot/i,
/crawler/i,
/scraper/i,
/spider/i,
];
/**
* Known SEO crawler user agent patterns
* These are legitimate crawlers used for SEO analysis and website monitoring
*/
export const SEO_CRAWLER_PATTERNS: (string | RegExp)[] = [
// Google SEO tools
/Googlebot/i,
/Google-Site-Verification/i,
/Google-InspectionTool/i,
// Bing
/Bingbot/i,
/BingPreview/i,
/msnbot/i,
// Yandex
/YandexBot/i,
/YandexImages/i,
/YandexMetrika/i,
// Baidu
/Baiduspider/i,
/Baidu/i,
// DuckDuckGo
/DuckDuckBot/i,
/DuckDuckGo/i,
// SEO tools
/AhrefsBot/i,
/SemrushBot/i,
/MJ12bot/i,
/DotBot/i,
/MegaIndex/i,
/BacklinkCrawler/i,
/SEOkicks/i,
/sistrix/i,
/BLEXBot/i,
// Social media crawlers
/facebookexternalhit/i,
/Twitterbot/i,
/LinkedInBot/i,
/WhatsApp/i,
/TelegramBot/i,
/SkypeUriPreview/i,
/Slackbot/i,
/Discordbot/i,
// Other search engines
/Yahoo/i,
/Slurp/i,
/Ask Jeeves/i,
/Teoma/i,
/ia_archiver/i,
/Wayback/i,
];
/**
* Check if a user agent matches any pattern in the given list
*/
export function matchesPatterns(userAgent: string, patterns: (string | RegExp)[]): { match: boolean; pattern?: string | RegExp } {
const lowerUserAgent = userAgent.toLowerCase();
for (const pattern of patterns) {
if (pattern instanceof RegExp) {
if (pattern.test(userAgent)) {
return { match: true, pattern };
}
} else if (lowerUserAgent.includes(pattern.toLowerCase())) {
return { match: true, pattern };
}
}
return { match: false };
}
/**
* Detect the type of crawler based on user agent
*/
export function detectCrawlerType(userAgent: string): 'ai' | 'seo' | null {
const aiMatch = matchesPatterns(userAgent, AI_CRAWLER_PATTERNS);
if (aiMatch.match) {
return 'ai';
}
const seoMatch = matchesPatterns(userAgent, SEO_CRAWLER_PATTERNS);
if (seoMatch.match) {
return 'seo';
}
return null;
}

24
src/index.ts Normal file

@@ -0,0 +1,24 @@
// Core functionality
export {
shouldBlockCrawler,
extractUserAgent,
mergeConfig,
DEFAULT_CONFIG,
} from './core.js';
// Crawler patterns and detection
export {
AI_CRAWLER_PATTERNS,
SEO_CRAWLER_PATTERNS,
matchesPatterns,
detectCrawlerType,
} from './crawlers.js';
// Types
export type {
CrawlerConfig,
CrawlerDetectionResult,
} from './types.js';
// Default export for convenience
export { shouldBlockCrawler as default } from './core.js';

30
src/types.ts Normal file

@@ -0,0 +1,30 @@
export interface CrawlerConfig {
/** Block AI crawlers (default: true) */
blockAI?: boolean;
/** Block SEO crawlers (default: false) */
blockSEO?: boolean;
/** Custom response message when blocking crawlers (default: "Access denied") */
message?: string;
/** HTTP status code to return when blocking (default: 403) */
statusCode?: number;
/** Additional user agents to block (regex patterns) */
customBlocked?: (string | RegExp)[];
/** User agents to always allow (takes precedence over blocking) */
whitelist?: (string | RegExp)[];
/** Custom response headers to set when blocking */
headers?: Record<string, string>;
/** Enable debug logging (default: false) */
debug?: boolean;
}
export interface CrawlerDetectionResult {
isBlocked: boolean;
crawlerType: 'ai' | 'seo' | 'custom' | null;
userAgent: string;
matchedPattern?: string | RegExp;
}
// Header types for different environments
export type HeadersLike =
| Record<string, string | string[] | undefined> // Express-style
| { get(name: string): string | null }; // Web API Headers-style

31
tsconfig.json Normal file

@@ -0,0 +1,31 @@
{
"compilerOptions": {
"target": "ES2020",
"module": "commonjs",
"lib": ["ES2020", "dom"],
"outDir": "./dist",
"rootDir": "./src",
"strict": true,
"esModuleInterop": true,
"skipLibCheck": true,
"forceConsistentCasingInFileNames": true,
"declaration": true,
"declarationMap": true,
"sourceMap": true,
"removeComments": false,
"noImplicitAny": true,
"noImplicitReturns": true,
"noImplicitThis": true,
"noUnusedLocals": true,
"noUnusedParameters": true,
"exactOptionalPropertyTypes": true,
"noImplicitOverride": true,
"noPropertyAccessFromIndexSignature": true,
"noUncheckedIndexedAccess": true,
"moduleResolution": "node",
"resolveJsonModule": true,
"allowSyntheticDefaultImports": true
},
"include": ["src/**/*"],
"exclude": ["node_modules", "dist", "**/*.test.ts", "**/*.spec.ts", "examples"]
}