init commit

commit d13ea65209, 2025-05-31 12:58:55 -04:00
18 changed files with 6436 additions and 0 deletions

src/core.test.ts (new file, 58 lines)

@@ -0,0 +1,58 @@
import { shouldBlockCrawler, extractUserAgent } from './core.js';

describe('shouldBlockCrawler', () => {
  test('should block AI crawlers by default', () => {
    const result = shouldBlockCrawler('GPTBot/1.0');
    expect(result.isBlocked).toBe(true);
    expect(result.crawlerType).toBe('ai');
  });

  test('should not block SEO crawlers by default', () => {
    const result = shouldBlockCrawler('Googlebot/2.1');
    expect(result.isBlocked).toBe(false);
    expect(result.crawlerType).toBe('seo');
  });

  test('should respect whitelist', () => {
    const result = shouldBlockCrawler('GPTBot/1.0', {
      whitelist: [/GPTBot/i]
    });
    expect(result.isBlocked).toBe(false);
  });

  test('should block custom patterns', () => {
    const result = shouldBlockCrawler('CustomBot/1.0', {
      customBlocked: [/CustomBot/i]
    });
    expect(result.isBlocked).toBe(true);
    expect(result.crawlerType).toBe('custom');
  });

  test('should allow regular browsers', () => {
    const result = shouldBlockCrawler('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36');
    expect(result.isBlocked).toBe(false);
    expect(result.crawlerType).toBe(null);
  });
});

describe('extractUserAgent', () => {
  test('should extract from Express-style headers', () => {
    const headers = { 'user-agent': 'TestBot/1.0' };
    const userAgent = extractUserAgent(headers);
    expect(userAgent).toBe('TestBot/1.0');
  });

  test('should extract from SvelteKit-style headers', () => {
    const headers = {
      get: (name: string) => name === 'user-agent' ? 'TestBot/1.0' : null
    };
    const userAgent = extractUserAgent(headers);
    expect(userAgent).toBe('TestBot/1.0');
  });

  test('should handle missing user agent', () => {
    const headers = {};
    const userAgent = extractUserAgent(headers);
    expect(userAgent).toBe('');
  });
});

src/core.ts (new file, 130 lines)

@@ -0,0 +1,130 @@
import { CrawlerConfig, CrawlerDetectionResult, HeadersLike } from './types.js';
import { AI_CRAWLER_PATTERNS, SEO_CRAWLER_PATTERNS, matchesPatterns, detectCrawlerType } from './crawlers.js';

/**
 * Default configuration for crawler blocking
 */
export const DEFAULT_CONFIG: Required<CrawlerConfig> = {
  blockAI: true,
  blockSEO: false,
  message: 'Access denied',
  statusCode: 403,
  customBlocked: [],
  whitelist: [],
  headers: {},
  debug: false,
};

/**
 * Merge user config with default config
 */
export function mergeConfig(userConfig: CrawlerConfig = {}): Required<CrawlerConfig> {
  return {
    ...DEFAULT_CONFIG,
    ...userConfig,
    customBlocked: [...DEFAULT_CONFIG.customBlocked, ...(userConfig.customBlocked || [])],
    whitelist: [...DEFAULT_CONFIG.whitelist, ...(userConfig.whitelist || [])],
    headers: { ...DEFAULT_CONFIG.headers, ...(userConfig.headers || {}) },
  };
}

/**
 * Log debug information if debug mode is enabled
 */
function debug(config: Required<CrawlerConfig>, message: string): void {
  if (config.debug) {
    // eslint-disable-next-line no-console
    console.log(`[crawl-me-not] ${message}`);
  }
}

/**
 * Check if a user agent should be blocked based on the configuration
 */
export function shouldBlockCrawler(userAgent: string, config: CrawlerConfig = {}): CrawlerDetectionResult {
  const mergedConfig = mergeConfig(config);

  // Default result
  const result: CrawlerDetectionResult = {
    isBlocked: false,
    crawlerType: null,
    userAgent,
  };

  debug(mergedConfig, `Checking user agent: ${userAgent}`);

  // Check whitelist first (takes precedence)
  const whitelistMatch = matchesPatterns(userAgent, mergedConfig.whitelist);
  if (whitelistMatch.match) {
    debug(mergedConfig, `User agent whitelisted by pattern: ${String(whitelistMatch.pattern)}`);
    return result;
  }

  // Check custom blocked patterns
  const customMatch = matchesPatterns(userAgent, mergedConfig.customBlocked);
  if (customMatch.match && customMatch.pattern) {
    debug(mergedConfig, `User agent blocked by custom pattern: ${String(customMatch.pattern)}`);
    return {
      ...result,
      isBlocked: true,
      crawlerType: 'custom',
      matchedPattern: customMatch.pattern,
    };
  }

  // Detect crawler type
  const crawlerType = detectCrawlerType(userAgent);

  if (crawlerType === 'ai' && mergedConfig.blockAI) {
    const aiMatch = matchesPatterns(userAgent, AI_CRAWLER_PATTERNS);
    if (aiMatch.pattern) {
      debug(mergedConfig, `AI crawler blocked: ${String(aiMatch.pattern)}`);
      return {
        ...result,
        isBlocked: true,
        crawlerType: 'ai',
        matchedPattern: aiMatch.pattern,
      };
    }
  }

  if (crawlerType === 'seo' && mergedConfig.blockSEO) {
    const seoMatch = matchesPatterns(userAgent, SEO_CRAWLER_PATTERNS);
    if (seoMatch.pattern) {
      debug(mergedConfig, `SEO crawler blocked: ${String(seoMatch.pattern)}`);
      return {
        ...result,
        isBlocked: true,
        crawlerType: 'seo',
        matchedPattern: seoMatch.pattern,
      };
    }
  }

  debug(mergedConfig, 'User agent allowed');
  return {
    ...result,
    crawlerType,
  };
}

/**
 * Extract user agent from various header formats
 */
export function extractUserAgent(headers: HeadersLike): string {
  // Handle Headers-like object (Web API, SvelteKit, etc.)
  if (typeof (headers as { get(name: string): string | null }).get === 'function') {
    const headersObj = headers as { get(name: string): string | null };
    return headersObj.get('user-agent') || headersObj.get('User-Agent') || '';
  }

  // Handle regular object (Express, Node.js, etc.)
  const headersObj = headers as Record<string, string | string[] | undefined>;
  const userAgent = headersObj['user-agent'] || headersObj['User-Agent'];

  if (Array.isArray(userAgent)) {
    return userAgent[0] || '';
  }

  return userAgent || '';
}
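
A minimal usage sketch (not part of this commit) of how the two exported helpers compose in a request handler; the handleRequest shape and its return value are hypothetical stand-ins for whatever framework the caller uses:

import { shouldBlockCrawler, extractUserAgent, mergeConfig } from './core.js';
import { HeadersLike } from './types.js';

// Hypothetical handler: returns a block response, or null to let the request through.
function handleRequest(headers: HeadersLike): { status: number; body: string } | null {
  const userAgent = extractUserAgent(headers);
  const result = shouldBlockCrawler(userAgent, { blockAI: true, blockSEO: false });
  if (result.isBlocked) {
    const config = mergeConfig({ blockAI: true, blockSEO: false });
    return { status: config.statusCode, body: config.message }; // 403 / 'Access denied' by default
  }
  return null;
}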

src/crawlers.ts (new file, 160 lines)

@@ -0,0 +1,160 @@
/**
 * Known AI crawler user agent patterns
 * These are patterns for bots that are primarily used for AI training, data collection, or content scraping
 */
export const AI_CRAWLER_PATTERNS: (string | RegExp)[] = [
  // OpenAI
  /GPTBot/i,
  /ChatGPT-User/i,

  // Google AI
  /Google-Extended/i,
  /GoogleOther/i,

  // Anthropic
  /Claude-Web/i,
  /ClaudeBot/i,

  // Meta/Facebook AI
  /FacebookBot/i,
  /Meta-ExternalAgent/i,

  // Bytedance/TikTok
  /Bytespider/i,
  /ByteDance/i,

  // Common AI/ML crawlers
  /CCBot/i,
  /anthropic-ai/i,
  /PerplexityBot/i,
  /YouBot/i,
  /ChatGPT/i,
  /GPT/i,
  /OpenAI/i,
  /AI2Bot/i,
  /cohere-ai/i,

  // Academic/Research crawlers often used for AI training
  /ArchiveBot/i,
  /Internet Archive/i,
  /archive\.org/i,

  // Content scrapers and data collectors
  /DataForSeoBot/i,
  /SemrushBot/i,
  /AhrefsBot/i,
  /MJ12bot/i,
  /DotBot/i,
  /CommonCrawl/i,
  /webzio/i,
  /Scrapy/i,
  /scrapy/i,
  /python-requests/i,
  /python-urllib/i,
  /curl/i,
  /wget/i,
  /HTTPie/i,
  /Postman/i,
  /Insomnia/i,

  // Generic AI/bot patterns
  /bot.*ai/i,
  /ai.*bot/i,
  /crawler/i,
  /scraper/i,
  /spider/i,
];

/**
 * Known SEO crawler user agent patterns
 * These are legitimate crawlers used for SEO analysis and website monitoring
 */
export const SEO_CRAWLER_PATTERNS: (string | RegExp)[] = [
  // Google SEO tools
  /Googlebot/i,
  /Google-Site-Verification/i,
  /Google-InspectionTool/i,

  // Bing
  /Bingbot/i,
  /BingPreview/i,
  /msnbot/i,

  // Yandex
  /YandexBot/i,
  /YandexImages/i,
  /YandexMetrika/i,

  // Baidu
  /Baiduspider/i,
  /Baidu/i,

  // DuckDuckGo
  /DuckDuckBot/i,
  /DuckDuckGo/i,

  // SEO tools
  /AhrefsBot/i,
  /SemrushBot/i,
  /MJ12bot/i,
  /DotBot/i,
  /MegaIndex/i,
  /BacklinkCrawler/i,
  /SEOkicks/i,
  /sistrix/i,
  /BLEXBot/i,

  // Social media crawlers
  /facebookexternalhit/i,
  /Twitterbot/i,
  /LinkedInBot/i,
  /WhatsApp/i,
  /TelegramBot/i,
  /SkypeUriPreview/i,
  /Slackbot/i,
  /Discordbot/i,

  // Other search engines
  /Yahoo/i,
  /Slurp/i,
  /Ask Jeeves/i,
  /Teoma/i,
  /ia_archiver/i,
  /Wayback/i,
];

/**
 * Check if a user agent matches any pattern in the given list
 */
export function matchesPatterns(userAgent: string, patterns: (string | RegExp)[]): { match: boolean; pattern?: string | RegExp } {
  const lowerUserAgent = userAgent.toLowerCase();

  for (const pattern of patterns) {
    if (pattern instanceof RegExp) {
      if (pattern.test(userAgent)) {
        return { match: true, pattern };
      }
    } else if (lowerUserAgent.includes(pattern.toLowerCase())) {
      return { match: true, pattern };
    }
  }

  return { match: false };
}

/**
 * Detect the type of crawler based on user agent
 */
export function detectCrawlerType(userAgent: string): 'ai' | 'seo' | null {
  const aiMatch = matchesPatterns(userAgent, AI_CRAWLER_PATTERNS);
  if (aiMatch.match) {
    return 'ai';
  }

  const seoMatch = matchesPatterns(userAgent, SEO_CRAWLER_PATTERNS);
  if (seoMatch.match) {
    return 'seo';
  }

  return null;
}
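
For orientation (illustrative, not part of the diff), how the two helpers behave on a few made-up user agents; note that detectCrawlerType consults AI_CRAWLER_PATTERNS before SEO_CRAWLER_PATTERNS, so a user agent listed in both arrays (e.g. AhrefsBot) is reported as 'ai':

import { detectCrawlerType, matchesPatterns, AI_CRAWLER_PATTERNS } from './crawlers.js';

console.log(detectCrawlerType('GPTBot/1.0'));     // 'ai'
console.log(detectCrawlerType('Googlebot/2.1'));  // 'seo'
console.log(detectCrawlerType('Mozilla/5.0 (Windows NT 10.0; Win64; x64)')); // null

// matchesPatterns also reports which pattern produced the match.
console.log(matchesPatterns('GPTBot/1.0', AI_CRAWLER_PATTERNS)); // { match: true, pattern: /GPTBot/i }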

src/index.ts (new file, 24 lines)

@@ -0,0 +1,24 @@
// Core functionality
export {
  shouldBlockCrawler,
  extractUserAgent,
  mergeConfig,
  DEFAULT_CONFIG,
} from './core.js';

// Crawler patterns and detection
export {
  AI_CRAWLER_PATTERNS,
  SEO_CRAWLER_PATTERNS,
  matchesPatterns,
  detectCrawlerType,
} from './crawlers.js';

// Types
export type {
  CrawlerConfig,
  CrawlerDetectionResult,
} from './types.js';

// Default export for convenience
export { shouldBlockCrawler as default } from './core.js';
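
Because of the default export, consumers can use either import style; the bare 'crawl-me-not' specifier is assumed from the debug log prefix, since no package.json appears in this excerpt:

import shouldBlockCrawler from 'crawl-me-not';
// ...or, equivalently: import { shouldBlockCrawler } from 'crawl-me-not';

const verdict = shouldBlockCrawler('CCBot/2.0');
console.log(verdict.isBlocked, verdict.crawlerType); // true 'ai'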

src/types.ts (new file, 30 lines)

@@ -0,0 +1,30 @@
export interface CrawlerConfig {
  /** Block AI crawlers (default: true) */
  blockAI?: boolean;
  /** Block SEO crawlers (default: false) */
  blockSEO?: boolean;
  /** Custom response message when blocking crawlers (default: "Access denied") */
  message?: string;
  /** HTTP status code to return when blocking (default: 403) */
  statusCode?: number;
  /** Additional user agents to block (string or regex patterns) */
  customBlocked?: (string | RegExp)[];
  /** User agents to always allow (takes precedence over blocking) */
  whitelist?: (string | RegExp)[];
  /** Custom response headers to set when blocking */
  headers?: Record<string, string>;
  /** Enable debug logging (default: false) */
  debug?: boolean;
}

export interface CrawlerDetectionResult {
  isBlocked: boolean;
  crawlerType: 'ai' | 'seo' | 'custom' | null;
  userAgent: string;
  matchedPattern?: string | RegExp;
}

// Header types for different environments
export type HeadersLike =
  | Record<string, string | string[] | undefined> // Express-style
  | { get(name: string): string | null }; // Web API Headers-style
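
For reference, a CrawlerConfig literal exercising most of the optional fields (the values are illustrative, not recommendations):

import type { CrawlerConfig } from './types.js';

const config: CrawlerConfig = {
  blockAI: true,                             // the default
  blockSEO: true,                            // opt in to blocking SEO crawlers too
  customBlocked: [/BadBot/i, 'evilscraper'], // strings are matched as case-insensitive substrings
  whitelist: [/GoodBot/i],                   // always allowed; wins over every block rule
  message: 'No crawling, please',
  statusCode: 429,
  headers: { 'X-Robots-Blocked': 'true' },
  debug: true,
};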