init commit

commit d13ea65209

@@ -0,0 +1,23 @@
{
  "env": {
    "browser": true,
    "es2021": true,
    "node": true,
    "jest": true
  },
  "extends": [
    "eslint:recommended"
  ],
  "parser": "@typescript-eslint/parser",
  "parserOptions": {
    "ecmaVersion": "latest",
    "sourceType": "module"
  },
  "plugins": ["@typescript-eslint"],
  "rules": {
    "@typescript-eslint/no-unused-vars": "error",
    "prefer-const": "error",
    "no-var": "error"
  },
  "ignorePatterns": ["dist/", "node_modules/", "*.js", "examples/", "**/*.test.ts", "**/*.spec.ts"]
}

@@ -0,0 +1,59 @@
# Dependencies
node_modules/
npm-debug.log*
yarn-debug.log*
yarn-error.log*

# Build output
dist/
build/
*.tsbuildinfo

# Coverage directory used by tools like istanbul
coverage/
*.lcov

# Runtime data
pids
*.pid
*.seed
*.pid.lock

# Logs
logs
*.log

# Environment variables
.env
.env.local
.env.development.local
.env.test.local
.env.production.local

# Editor directories and files
.vscode/
.idea/
*.swp
*.swo
*~

# OS generated files
.DS_Store
.DS_Store?
._*
.Spotlight-V100
.Trashes
ehthumbs.db
Thumbs.db

# Temporary folders
tmp/
temp/

# Testing
.nyc_output

# Package manager lock files (uncomment if you want to include them)
# package-lock.json
# yarn.lock
# pnpm-lock.yaml

@@ -0,0 +1 @@
nodejs 22.16.0

@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2024 crawl-me-not contributors

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

@@ -0,0 +1,355 @@
# crawl-me-not 🚫🤖

A lightweight, framework-agnostic library to detect and block AI crawlers and SEO crawlers from any web server or framework.

## Features

- 🚫 **Block AI Crawlers**: Detect 43+ AI training bots like GPTBot, ChatGPT-User, Claude-Web, and more
- 🔍 **Optional SEO Blocking**: Also detect SEO crawlers when needed
- 🎯 **Framework Agnostic**: Works with Express, SvelteKit, Next.js, Fastify, vanilla Node.js, and more
- 🛠️ **Highly Configurable**: Custom patterns, whitelists, response messages, and headers
- 📝 **TypeScript**: Full TypeScript support with detailed type definitions
- 🪶 **Zero Dependencies**: Lightweight with no external dependencies
- 🧪 **Well Tested**: Comprehensive test coverage

## Installation

```bash
npm install crawl-me-not
```

## Quick Start

```typescript
import { shouldBlockCrawler, extractUserAgent } from 'crawl-me-not';

// Basic usage
const userAgent = extractUserAgent(request.headers);
const result = shouldBlockCrawler(userAgent);

if (result.isBlocked) {
  // Send 403 response
  return new Response('Access denied', { status: 403 });
}

// Continue with normal request handling
```

## Configuration Options

```typescript
interface CrawlerConfig {
  blockAI?: boolean;                    // Block AI crawlers (default: true)
  blockSEO?: boolean;                   // Block SEO crawlers (default: false)
  message?: string;                     // Custom response message (default: "Access denied")
  statusCode?: number;                  // HTTP status code (default: 403)
  customBlocked?: (string | RegExp)[];  // Additional patterns to block
  whitelist?: (string | RegExp)[];      // Patterns to always allow
  headers?: Record<string, string>;     // Custom response headers
  debug?: boolean;                      // Enable debug logging (default: false)
}
```

## Framework Examples

### Express

```typescript
import express from 'express';
import { shouldBlockCrawler, extractUserAgent } from 'crawl-me-not';

const app = express();

app.use((req, res, next) => {
  const userAgent = extractUserAgent(req.headers);
  const result = shouldBlockCrawler(userAgent, {
    blockAI: true,
    blockSEO: false,
    debug: true
  });

  if (result.isBlocked) {
    return res.status(403).json({
      error: 'Access denied',
      reason: `${result.crawlerType} crawler detected`,
      userAgent: result.userAgent
    });
  }

  next();
});
```

### SvelteKit

```typescript
// src/hooks.server.ts
import { shouldBlockCrawler, extractUserAgent } from 'crawl-me-not';
import type { Handle } from '@sveltejs/kit';

export const handle: Handle = async ({ event, resolve }) => {
  const userAgent = extractUserAgent(event.request.headers);
  const result = shouldBlockCrawler(userAgent);

  if (result.isBlocked) {
    return new Response('Access denied', {
      status: 403,
      headers: { 'X-Blocked-Reason': 'AI crawler detected' }
    });
  }

  return resolve(event);
};
```

### Next.js (App Router)

```typescript
// middleware.ts
import { shouldBlockCrawler, extractUserAgent } from 'crawl-me-not';
import { NextRequest, NextResponse } from 'next/server';

export function middleware(request: NextRequest) {
  const userAgent = extractUserAgent(request.headers);
  const result = shouldBlockCrawler(userAgent);

  if (result.isBlocked) {
    return NextResponse.json(
      { error: 'Access denied' },
      { status: 403 }
    );
  }

  return NextResponse.next();
}
```

### Next.js (Pages Router)

```typescript
// pages/api/[...all].ts
import { shouldBlockCrawler, extractUserAgent } from 'crawl-me-not';
import type { NextApiRequest, NextApiResponse } from 'next';

export default function handler(req: NextApiRequest, res: NextApiResponse) {
  const userAgent = extractUserAgent(req.headers);
  const result = shouldBlockCrawler(userAgent);

  if (result.isBlocked) {
    return res.status(403).json({ error: 'Access denied' });
  }

  // Continue with your API logic
  res.status(200).json({ message: 'Hello World' });
}
```

### Fastify

```typescript
import Fastify from 'fastify';
import { shouldBlockCrawler, extractUserAgent } from 'crawl-me-not';

const fastify = Fastify();

fastify.addHook('preHandler', async (request, reply) => {
  const userAgent = extractUserAgent(request.headers);
  const result = shouldBlockCrawler(userAgent);

  if (result.isBlocked) {
    reply.status(403).send({ error: 'Access denied' });
    return;
  }
});
```

### Vanilla Node.js

```typescript
import http from 'node:http';
import { shouldBlockCrawler, extractUserAgent } from 'crawl-me-not';

const server = http.createServer((req, res) => {
  const userAgent = extractUserAgent(req.headers);
  const result = shouldBlockCrawler(userAgent);

  if (result.isBlocked) {
    res.statusCode = 403;
    res.setHeader('Content-Type', 'application/json');
    res.end(JSON.stringify({ error: 'Access denied' }));
    return;
  }

  // Your normal request handling
  res.statusCode = 200;
  res.end('Hello World!');
});
```

### Bun

```typescript
import { shouldBlockCrawler, extractUserAgent } from 'crawl-me-not';

Bun.serve({
  fetch(request) {
    const userAgent = extractUserAgent(request.headers);
    const result = shouldBlockCrawler(userAgent);

    if (result.isBlocked) {
      return new Response('Access denied', { status: 403 });
    }

    return new Response('Hello World!');
  },
});
```

## Advanced Usage

### Custom Configuration

```typescript
const result = shouldBlockCrawler(userAgent, {
  blockAI: true,
  blockSEO: false,
  customBlocked: [
    /badbot/i,           // Block anything with "badbot"
    'unwanted-crawler',  // Block exact string match
    /scraper.*v[0-9]/i   // Block scraper versions
  ],
  whitelist: [
    /goodbot/i,          // Always allow "goodbot"
    'monitoring-service' // Always allow this service
  ],
  message: 'Custom blocking message',
  statusCode: 429,
  headers: {
    'X-Blocked-Reason': 'Automated traffic detected',
    'Retry-After': '3600'
  },
  debug: true
});
```

### Manual Detection (Non-blocking)

```typescript
import { shouldBlockCrawler, extractUserAgent } from 'crawl-me-not';

// Just detect, don't block
const userAgent = extractUserAgent(request.headers);
const result = shouldBlockCrawler(userAgent, { blockAI: false, blockSEO: false });

// Log crawler activity
if (result.crawlerType) {
  console.log(`Detected ${result.crawlerType} crawler:`, result.userAgent);
}

// Apply custom logic
if (result.crawlerType === 'ai' && isRateLimited(request)) {
  return blockResponse();
}
```

### Rate Limiting for Crawlers

```typescript
const crawlerLimits = new Map();

app.use((req, res, next) => {
  const userAgent = extractUserAgent(req.headers);
  const result = shouldBlockCrawler(userAgent, { blockAI: false });

  if (result.crawlerType === 'ai') {
    const ip = req.ip;
    const now = Date.now();
    const limit = crawlerLimits.get(ip) || { count: 0, resetTime: now + 60000 };

    if (now > limit.resetTime) {
      limit.count = 0;
      limit.resetTime = now + 60000;
    }

    limit.count++;
    crawlerLimits.set(ip, limit);

    if (limit.count > 10) {
      return res.status(429).json({ error: 'Rate limit exceeded' });
    }
  }

  next();
});
```

## Known Crawlers

### AI Crawlers (Detected by default)

- **OpenAI**: GPTBot, ChatGPT-User
- **Google AI**: Google-Extended, GoogleOther
- **Anthropic**: Claude-Web, ClaudeBot
- **Meta/Facebook**: FacebookBot, Meta-ExternalAgent
- **ByteDance**: Bytespider, ByteDance
- **Others**: CCBot, PerplexityBot, YouBot, AI2Bot, cohere-ai
- **Generic patterns**: python-requests, curl, wget, scrapy, etc.
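
The generic patterns also match common HTTP tools such as curl, wget, and python-requests, so automation you run yourself can be caught. A minimal sketch of whitelisting a trusted probe (the `monitoring-service` user-agent string is a made-up example):

```typescript
import { shouldBlockCrawler } from 'crawl-me-not';

// curl/ is matched by the generic AI patterns, so whitelist probes you trust.
const result = shouldBlockCrawler('curl/8.5.0 monitoring-service', {
  whitelist: ['monitoring-service'], // checked before any blocking patterns
});

console.log(result.isBlocked); // false (the whitelist takes precedence)
```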

### SEO Crawlers (Detected but allowed by default)

- **Search Engines**: Googlebot, Bingbot, YandexBot, Baiduspider, DuckDuckBot
- **SEO Tools**: AhrefsBot, SemrushBot, MJ12bot, DotBot
- **Social Media**: facebookexternalhit, Twitterbot, LinkedInBot, WhatsApp
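
These are detected but allowed unless you opt in with `blockSEO: true`. A minimal sketch:

```typescript
import { shouldBlockCrawler } from 'crawl-me-not';

// Googlebot is classified as an SEO crawler; blockSEO opts in to blocking it.
const result = shouldBlockCrawler('Googlebot/2.1', { blockSEO: true });

console.log(result.isBlocked);   // true
console.log(result.crawlerType); // 'seo'
```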

## API Reference

### `shouldBlockCrawler(userAgent: string, config?: CrawlerConfig): CrawlerDetectionResult`

Main function to check if a user agent should be blocked.

**Returns:**
```typescript
interface CrawlerDetectionResult {
  isBlocked: boolean;                           // Whether the crawler should be blocked
  crawlerType: 'ai' | 'seo' | 'custom' | null;  // Type of crawler detected
  userAgent: string;                            // The original user agent string
  matchedPattern?: string | RegExp;             // Pattern that matched (if blocked)
}
```
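
A blocked result also carries the pattern that matched, which is handy for logging. A minimal sketch:

```typescript
import { shouldBlockCrawler } from 'crawl-me-not';

const result = shouldBlockCrawler('GPTBot/1.0');

if (result.isBlocked) {
  // matchedPattern is only present when a blocking pattern actually matched
  console.log(`Blocked ${result.crawlerType} crawler via`, result.matchedPattern);
}
```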

### `extractUserAgent(headers: HeadersLike): string`

Utility function to extract user agent from various header formats.

**Supports:**
- Express-style headers: `{ 'user-agent': 'string' }`
- Web API Headers: `headers.get('user-agent')`
- Node.js IncomingMessage: `req.headers['user-agent']`
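
All three shapes normalize to a plain string. A minimal sketch (the `Headers` constructor assumes a runtime that provides the Web API, such as Node 18+ or a browser):

```typescript
import { extractUserAgent } from 'crawl-me-not';

// Express / Node.js style: a plain object keyed by lowercase header names
extractUserAgent({ 'user-agent': 'TestBot/1.0' });              // 'TestBot/1.0'

// Web API Headers (fetch, SvelteKit, Next.js middleware)
extractUserAgent(new Headers({ 'User-Agent': 'TestBot/1.0' })); // 'TestBot/1.0'

// Missing header falls back to an empty string
extractUserAgent({});                                           // ''
```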

### `detectCrawlerType(userAgent: string): 'ai' | 'seo' | null`

Detect what type of crawler the user agent represents, without applying any blocking logic.
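
A minimal sketch of using it purely for classification:

```typescript
import { detectCrawlerType } from 'crawl-me-not';

detectCrawlerType('GPTBot/1.0');    // 'ai'
detectCrawlerType('Googlebot/2.1'); // 'seo'
detectCrawlerType('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'); // null
```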

### Constants

- `AI_CRAWLER_PATTERNS`: Array of patterns for AI crawlers
- `SEO_CRAWLER_PATTERNS`: Array of patterns for SEO crawlers
- `DEFAULT_CONFIG`: Default configuration object
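
These are plain exports, so you can inspect or reuse them directly. A minimal sketch:

```typescript
import {
  AI_CRAWLER_PATTERNS,
  SEO_CRAWLER_PATTERNS,
  DEFAULT_CONFIG,
} from 'crawl-me-not';

console.log(AI_CRAWLER_PATTERNS.length);  // number of built-in AI patterns
console.log(SEO_CRAWLER_PATTERNS.length); // number of built-in SEO patterns
console.log(DEFAULT_CONFIG.statusCode);   // 403
console.log(DEFAULT_CONFIG.blockSEO);     // false
```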

## Contributing

Contributions are welcome! Please feel free to submit issues and pull requests.

## License

MIT © crawl-me-not contributors

## Changelog

### 1.0.0
- Initial release
- Framework-agnostic design
- Comprehensive AI crawler detection
- Optional SEO crawler detection
- TypeScript support
- Zero dependencies

@@ -0,0 +1,39 @@
const express = require('express');
const { shouldBlockCrawler, extractUserAgent } = require('crawl-me-not');

const app = express();

// Middleware to block AI crawlers
app.use((req, res, next) => {
  const userAgent = extractUserAgent(req.headers);
  const result = shouldBlockCrawler(userAgent, {
    blockAI: true,
    blockSEO: false,
    debug: true
  });

  if (result.isBlocked) {
    console.log(`Blocked ${result.crawlerType} crawler: ${result.userAgent}`);
    return res.status(403).json({
      error: 'Access denied',
      reason: `${result.crawlerType} crawler detected`,
      userAgent: result.userAgent
    });
  }

  next();
});

app.get('/', (req, res) => {
  res.json({ message: 'Hello World! AI crawlers are blocked.' });
});

app.get('/api/data', (req, res) => {
  res.json({ data: 'This API is protected from AI crawlers' });
});

const port = process.env.PORT || 3000;
app.listen(port, () => {
  console.log(`Server running on port ${port}`);
  console.log('AI crawlers will receive a 403 response');
});

@@ -0,0 +1,46 @@
import { shouldBlockCrawler, extractUserAgent } from 'crawl-me-not';
import { NextRequest, NextResponse } from 'next/server';

export function middleware(request: NextRequest) {
  const userAgent = extractUserAgent(request.headers);
  const result = shouldBlockCrawler(userAgent, {
    blockAI: true,
    blockSEO: false,
    customBlocked: [/scrapy/i, /curl/i], // Block additional patterns
    debug: true
  });

  if (result.isBlocked) {
    console.log(`Blocked ${result.crawlerType} crawler: ${result.userAgent}`);

    return NextResponse.json(
      {
        error: 'Access denied',
        reason: `${result.crawlerType} crawler detected`,
        userAgent: result.userAgent
      },
      {
        status: 403,
        headers: {
          'X-Blocked-Reason': 'Automated traffic detected'
        }
      }
    );
  }

  return NextResponse.next();
}

// Configure which paths to run middleware on
export const config = {
  matcher: [
    /*
     * Match all request paths except for the ones starting with:
     * - api (API routes)
     * - _next/static (static files)
     * - _next/image (image optimization files)
     * - favicon.ico (favicon file)
     */
    '/((?!api|_next/static|_next/image|favicon.ico).*)',
  ],
};

@@ -0,0 +1,32 @@
import { shouldBlockCrawler, extractUserAgent } from 'crawl-me-not';
import type { Handle } from '@sveltejs/kit';

export const handle: Handle = async ({ event, resolve }) => {
  const userAgent = extractUserAgent(event.request.headers);
  const result = shouldBlockCrawler(userAgent, {
    blockAI: true,
    blockSEO: false,
    debug: true
  });

  if (result.isBlocked) {
    console.log(`Blocked ${result.crawlerType} crawler: ${result.userAgent}`);
    // Send a JSON body to match the Content-Type header below
    return new Response(JSON.stringify({ error: 'Access denied' }), {
      status: 403,
      headers: {
        'Content-Type': 'application/json',
        'X-Blocked-Reason': `${result.crawlerType} crawler detected`
      }
    });
  }

  return resolve(event);
};

// If you need to compose multiple handles:
// import { sequence } from '@sveltejs/kit/hooks';
//
// export const handle = sequence(
//   crawlerBlockingHandle,
//   // your other handles...
// );

@@ -0,0 +1,39 @@
const http = require('node:http');
const { shouldBlockCrawler, extractUserAgent } = require('crawl-me-not');

const server = http.createServer((req, res) => {
  const userAgent = extractUserAgent(req.headers);
  const result = shouldBlockCrawler(userAgent, {
    blockAI: true,
    blockSEO: false,
    debug: true
  });

  if (result.isBlocked) {
    console.log(`Blocked ${result.crawlerType} crawler: ${result.userAgent}`);

    res.statusCode = 403;
    res.setHeader('Content-Type', 'application/json');
    res.setHeader('X-Blocked-Reason', 'AI crawler detected');
    res.end(JSON.stringify({
      error: 'Access denied',
      reason: `${result.crawlerType} crawler detected`,
      userAgent: result.userAgent
    }));
    return;
  }

  // Normal request handling
  res.statusCode = 200;
  res.setHeader('Content-Type', 'application/json');
  res.end(JSON.stringify({
    message: 'Hello World!',
    timestamp: new Date().toISOString()
  }));
});

const port = process.env.PORT || 3000;
server.listen(port, () => {
  console.log(`Server running on port ${port}`);
  console.log('AI crawlers will receive a 403 response');
});

@@ -0,0 +1,21 @@
module.exports = {
  preset: 'ts-jest',
  testEnvironment: 'node',
  roots: ['<rootDir>/src'],
  testMatch: ['**/__tests__/**/*.ts', '**/?(*.)+(spec|test).ts'],
  transform: {
    '^.+\\.ts$': ['ts-jest', {
      useESM: true,
    }],
  },
  collectCoverageFrom: [
    'src/**/*.ts',
    '!src/**/*.d.ts',
  ],
  coverageDirectory: 'coverage',
  coverageReporters: ['text', 'lcov', 'html'],
  moduleNameMapper: {
    '^(\\.{1,2}/.*)\\.js$': '$1',
  },
  extensionsToTreatAsEsm: ['.ts'],
};

File diff suppressed because it is too large

@@ -0,0 +1,57 @@
{
  "name": "@silentsilas/crawl-me-not",
  "version": "1.0.0",
  "description": "Detect and block AI crawlers and SEO crawlers from any web server or framework",
  "main": "dist/index.js",
  "types": "dist/index.d.ts",
  "files": [
    "dist",
    "README.md",
    "LICENSE"
  ],
  "scripts": {
    "build": "tsc",
    "dev": "tsc --watch",
    "lint": "eslint src/**/*.ts",
    "lint:fix": "eslint src/**/*.ts --fix",
    "test": "jest",
    "prepublishOnly": "npm run build",
    "clean": "rm -rf dist"
  },
  "keywords": [
    "crawler",
    "bot",
    "ai",
    "seo",
    "middleware",
    "robots",
    "scraping",
    "protection",
    "user-agent",
    "detection",
    "framework-agnostic"
  ],
  "author": "Silas",
  "license": "MIT",
  "repository": {
    "type": "git",
    "url": "git@git.silentsilas.com:silentsilas/crawl-me-not.git"
  },
  "bugs": {
    "url": "https://git.silentsilas.com/silentsilas/crawl-me-not/issues"
  },
  "homepage": "https://git.silentsilas.com/silentsilas/crawl-me-not#readme",
  "devDependencies": {
    "@types/jest": "^29.5.8",
    "@types/node": "^20.9.0",
    "@typescript-eslint/eslint-plugin": "^6.12.0",
    "@typescript-eslint/parser": "^6.12.0",
    "eslint": "^8.54.0",
    "jest": "^29.7.0",
    "ts-jest": "^29.1.1",
    "typescript": "^5.2.2"
  },
  "engines": {
    "node": ">=16.0.0"
  }
}

@@ -0,0 +1,58 @@
import { shouldBlockCrawler, extractUserAgent } from './core.js';

describe('shouldBlockCrawler', () => {
  test('should block AI crawlers by default', () => {
    const result = shouldBlockCrawler('GPTBot/1.0');
    expect(result.isBlocked).toBe(true);
    expect(result.crawlerType).toBe('ai');
  });

  test('should not block SEO crawlers by default', () => {
    const result = shouldBlockCrawler('Googlebot/2.1');
    expect(result.isBlocked).toBe(false);
    expect(result.crawlerType).toBe('seo');
  });

  test('should respect whitelist', () => {
    const result = shouldBlockCrawler('GPTBot/1.0', {
      whitelist: [/GPTBot/i]
    });
    expect(result.isBlocked).toBe(false);
  });

  test('should block custom patterns', () => {
    const result = shouldBlockCrawler('CustomBot/1.0', {
      customBlocked: [/CustomBot/i]
    });
    expect(result.isBlocked).toBe(true);
    expect(result.crawlerType).toBe('custom');
  });

  test('should allow regular browsers', () => {
    const result = shouldBlockCrawler('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36');
    expect(result.isBlocked).toBe(false);
    expect(result.crawlerType).toBe(null);
  });
});

describe('extractUserAgent', () => {
  test('should extract from Express-style headers', () => {
    const headers = { 'user-agent': 'TestBot/1.0' };
    const userAgent = extractUserAgent(headers);
    expect(userAgent).toBe('TestBot/1.0');
  });

  test('should extract from SvelteKit-style headers', () => {
    const headers = {
      get: (name: string) => name === 'user-agent' ? 'TestBot/1.0' : null
    };
    const userAgent = extractUserAgent(headers);
    expect(userAgent).toBe('TestBot/1.0');
  });

  test('should handle missing user agent', () => {
    const headers = {};
    const userAgent = extractUserAgent(headers);
    expect(userAgent).toBe('');
  });
});

@@ -0,0 +1,130 @@
import { CrawlerConfig, CrawlerDetectionResult, HeadersLike } from './types.js';
import { AI_CRAWLER_PATTERNS, SEO_CRAWLER_PATTERNS, matchesPatterns, detectCrawlerType } from './crawlers.js';

/**
 * Default configuration for crawler blocking
 */
export const DEFAULT_CONFIG: Required<CrawlerConfig> = {
  blockAI: true,
  blockSEO: false,
  message: 'Access denied',
  statusCode: 403,
  customBlocked: [],
  whitelist: [],
  headers: {},
  debug: false,
};

/**
 * Merge user config with default config
 */
export function mergeConfig(userConfig: CrawlerConfig = {}): Required<CrawlerConfig> {
  return {
    ...DEFAULT_CONFIG,
    ...userConfig,
    customBlocked: [...DEFAULT_CONFIG.customBlocked, ...(userConfig.customBlocked || [])],
    whitelist: [...DEFAULT_CONFIG.whitelist, ...(userConfig.whitelist || [])],
    headers: { ...DEFAULT_CONFIG.headers, ...(userConfig.headers || {}) },
  };
}

/**
 * Log debug information if debug mode is enabled
 */
function debug(config: Required<CrawlerConfig>, message: string): void {
  if (config.debug) {
    // eslint-disable-next-line no-console
    console.log(`[crawl-me-not] ${message}`);
  }
}

/**
 * Check if a user agent should be blocked based on the configuration
 */
export function shouldBlockCrawler(userAgent: string, config: CrawlerConfig = {}): CrawlerDetectionResult {
  const mergedConfig = mergeConfig(config);

  // Default result
  const result: CrawlerDetectionResult = {
    isBlocked: false,
    crawlerType: null,
    userAgent,
  };

  debug(mergedConfig, `Checking user agent: ${userAgent}`);

  // Check whitelist first (takes precedence)
  const whitelistMatch = matchesPatterns(userAgent, mergedConfig.whitelist);
  if (whitelistMatch.match) {
    debug(mergedConfig, `User agent whitelisted by pattern: ${String(whitelistMatch.pattern)}`);
    return result;
  }

  // Check custom blocked patterns
  const customMatch = matchesPatterns(userAgent, mergedConfig.customBlocked);
  if (customMatch.match && customMatch.pattern) {
    debug(mergedConfig, `User agent blocked by custom pattern: ${String(customMatch.pattern)}`);
    return {
      ...result,
      isBlocked: true,
      crawlerType: 'custom',
      matchedPattern: customMatch.pattern,
    };
  }

  // Detect crawler type
  const crawlerType = detectCrawlerType(userAgent);

  if (crawlerType === 'ai' && mergedConfig.blockAI) {
    const aiMatch = matchesPatterns(userAgent, AI_CRAWLER_PATTERNS);
    if (aiMatch.pattern) {
      debug(mergedConfig, `AI crawler blocked: ${String(aiMatch.pattern)}`);
      return {
        ...result,
        isBlocked: true,
        crawlerType: 'ai',
        matchedPattern: aiMatch.pattern,
      };
    }
  }

  if (crawlerType === 'seo' && mergedConfig.blockSEO) {
    const seoMatch = matchesPatterns(userAgent, SEO_CRAWLER_PATTERNS);
    if (seoMatch.pattern) {
      debug(mergedConfig, `SEO crawler blocked: ${String(seoMatch.pattern)}`);
      return {
        ...result,
        isBlocked: true,
        crawlerType: 'seo',
        matchedPattern: seoMatch.pattern,
      };
    }
  }

  debug(mergedConfig, 'User agent allowed');
  return {
    ...result,
    crawlerType,
  };
}

/**
 * Extract user agent from various header formats
 */
export function extractUserAgent(headers: HeadersLike): string {
  // Handle Headers-like object (Web API, SvelteKit, etc.)
  if (typeof (headers as { get(name: string): string | null }).get === 'function') {
    const headersObj = headers as { get(name: string): string | null };
    return headersObj.get('user-agent') || headersObj.get('User-Agent') || '';
  }

  // Handle regular object (Express, Node.js, etc.)
  const headersObj = headers as Record<string, string | string[] | undefined>;
  const userAgent = headersObj['user-agent'] || headersObj['User-Agent'];

  if (Array.isArray(userAgent)) {
    return userAgent[0] || '';
  }

  return userAgent || '';
}

@@ -0,0 +1,160 @@
/**
 * Known AI crawler user agent patterns
 * These are patterns for bots that are primarily used for AI training, data collection, or content scraping
 */
export const AI_CRAWLER_PATTERNS: (string | RegExp)[] = [
  // OpenAI
  /GPTBot/i,
  /ChatGPT-User/i,

  // Google AI
  /Google-Extended/i,
  /GoogleOther/i,

  // Anthropic
  /Claude-Web/i,
  /ClaudeBot/i,

  // Meta/Facebook AI
  /FacebookBot/i,
  /Meta-ExternalAgent/i,

  // Bytedance/TikTok
  /Bytespider/i,
  /ByteDance/i,

  // Common AI/ML crawlers
  /CCBot/i,
  /anthropic-ai/i,
  /PerplexityBot/i,
  /YouBot/i,
  /ChatGPT/i,
  /GPT/i,
  /OpenAI/i,
  /AI2Bot/i,
  /cohere-ai/i,

  // Academic/Research crawlers often used for AI training
  /ArchiveBot/i,
  /Internet Archive/i,
  /archive\.org/i,

  // Content scrapers and data collectors
  /DataForSeoBot/i,
  /SemrushBot/i,
  /AhrefsBot/i,
  /MJ12bot/i,
  /DotBot/i,
  /CommonCrawl/i,
  /webzio/i,
  /Scrapy/i,
  /scrapy/i,
  /python-requests/i,
  /python-urllib/i,
  /curl/i,
  /wget/i,
  /HTTPie/i,
  /Postman/i,
  /Insomnia/i,

  // Generic AI/bot patterns
  /bot.*ai/i,
  /ai.*bot/i,
  /crawler/i,
  /scraper/i,
  /spider/i,
];

/**
 * Known SEO crawler user agent patterns
 * These are legitimate crawlers used for SEO analysis and website monitoring
 */
export const SEO_CRAWLER_PATTERNS: (string | RegExp)[] = [
  // Google SEO tools
  /Googlebot/i,
  /Google-Site-Verification/i,
  /Google-InspectionTool/i,

  // Bing
  /Bingbot/i,
  /BingPreview/i,
  /msnbot/i,

  // Yandex
  /YandexBot/i,
  /YandexImages/i,
  /YandexMetrika/i,

  // Baidu
  /Baiduspider/i,
  /Baidu/i,

  // DuckDuckGo
  /DuckDuckBot/i,
  /DuckDuckGo/i,

  // SEO tools
  /AhrefsBot/i,
  /SemrushBot/i,
  /MJ12bot/i,
  /DotBot/i,
  /MegaIndex/i,
  /BacklinkCrawler/i,
  /SEOkicks/i,
  /sistrix/i,
  /BLEXBot/i,

  // Social media crawlers
  /facebookexternalhit/i,
  /Twitterbot/i,
  /LinkedInBot/i,
  /WhatsApp/i,
  /TelegramBot/i,
  /SkypeUriPreview/i,
  /Slackbot/i,
  /Discordbot/i,

  // Other search engines
  /Yahoo/i,
  /Slurp/i,
  /Ask Jeeves/i,
  /Teoma/i,
  /ia_archiver/i,
  /Wayback/i,
];

/**
 * Check if a user agent matches any pattern in the given list
 */
export function matchesPatterns(userAgent: string, patterns: (string | RegExp)[]): { match: boolean; pattern?: string | RegExp } {
  const lowerUserAgent = userAgent.toLowerCase();

  for (const pattern of patterns) {
    if (pattern instanceof RegExp) {
      if (pattern.test(userAgent)) {
        return { match: true, pattern };
      }
    } else if (lowerUserAgent.includes(pattern.toLowerCase())) {
      return { match: true, pattern };
    }
  }

  return { match: false };
}

/**
 * Detect the type of crawler based on user agent
 */
export function detectCrawlerType(userAgent: string): 'ai' | 'seo' | null {
  const aiMatch = matchesPatterns(userAgent, AI_CRAWLER_PATTERNS);
  if (aiMatch.match) {
    return 'ai';
  }

  const seoMatch = matchesPatterns(userAgent, SEO_CRAWLER_PATTERNS);
  if (seoMatch.match) {
    return 'seo';
  }

  return null;
}

@@ -0,0 +1,24 @@
// Core functionality
export {
  shouldBlockCrawler,
  extractUserAgent,
  mergeConfig,
  DEFAULT_CONFIG,
} from './core.js';

// Crawler patterns and detection
export {
  AI_CRAWLER_PATTERNS,
  SEO_CRAWLER_PATTERNS,
  matchesPatterns,
  detectCrawlerType,
} from './crawlers.js';

// Types
export type {
  CrawlerConfig,
  CrawlerDetectionResult,
} from './types.js';

// Default export for convenience
export { shouldBlockCrawler as default } from './core.js';

@@ -0,0 +1,30 @@
export interface CrawlerConfig {
  /** Block AI crawlers (default: true) */
  blockAI?: boolean;
  /** Block SEO crawlers (default: false) */
  blockSEO?: boolean;
  /** Custom response message when blocking crawlers (default: "Access denied") */
  message?: string;
  /** HTTP status code to return when blocking (default: 403) */
  statusCode?: number;
  /** Additional user agents to block (regex patterns) */
  customBlocked?: (string | RegExp)[];
  /** User agents to always allow (takes precedence over blocking) */
  whitelist?: (string | RegExp)[];
  /** Custom response headers to set when blocking */
  headers?: Record<string, string>;
  /** Enable debug logging (default: false) */
  debug?: boolean;
}

export interface CrawlerDetectionResult {
  isBlocked: boolean;
  crawlerType: 'ai' | 'seo' | 'custom' | null;
  userAgent: string;
  matchedPattern?: string | RegExp;
}

// Header types for different environments
export type HeadersLike =
  | Record<string, string | string[] | undefined> // Express-style
  | { get(name: string): string | null }; // Web API Headers-style

@@ -0,0 +1,31 @@
{
  "compilerOptions": {
    "target": "ES2020",
    "module": "commonjs",
    "lib": ["ES2020", "dom"],
    "outDir": "./dist",
    "rootDir": "./src",
    "strict": true,
    "esModuleInterop": true,
    "skipLibCheck": true,
    "forceConsistentCasingInFileNames": true,
    "declaration": true,
    "declarationMap": true,
    "sourceMap": true,
    "removeComments": false,
    "noImplicitAny": true,
    "noImplicitReturns": true,
    "noImplicitThis": true,
    "noUnusedLocals": true,
    "noUnusedParameters": true,
    "exactOptionalPropertyTypes": true,
    "noImplicitOverride": true,
    "noPropertyAccessFromIndexSignature": true,
    "noUncheckedIndexedAccess": true,
    "moduleResolution": "node",
    "resolveJsonModule": true,
    "allowSyntheticDefaultImports": true
  },
  "include": ["src/**/*"],
  "exclude": ["node_modules", "dist", "**/*.test.ts", "**/*.spec.ts", "examples"]
}