init commit
58  src/core.test.ts  Normal file
@@ -0,0 +1,58 @@
import { shouldBlockCrawler, extractUserAgent } from './core.js';

describe('shouldBlockCrawler', () => {
  test('should block AI crawlers by default', () => {
    const result = shouldBlockCrawler('GPTBot/1.0');
    expect(result.isBlocked).toBe(true);
    expect(result.crawlerType).toBe('ai');
  });

  test('should not block SEO crawlers by default', () => {
    const result = shouldBlockCrawler('Googlebot/2.1');
    expect(result.isBlocked).toBe(false);
    expect(result.crawlerType).toBe('seo');
  });

  test('should respect whitelist', () => {
    const result = shouldBlockCrawler('GPTBot/1.0', {
      whitelist: [/GPTBot/i]
    });
    expect(result.isBlocked).toBe(false);
  });

  test('should block custom patterns', () => {
    const result = shouldBlockCrawler('CustomBot/1.0', {
      customBlocked: [/CustomBot/i]
    });
    expect(result.isBlocked).toBe(true);
    expect(result.crawlerType).toBe('custom');
  });

  test('should allow regular browsers', () => {
    const result = shouldBlockCrawler('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36');
    expect(result.isBlocked).toBe(false);
    expect(result.crawlerType).toBe(null);
  });
});

describe('extractUserAgent', () => {
  test('should extract from Express-style headers', () => {
    const headers = { 'user-agent': 'TestBot/1.0' };
    const userAgent = extractUserAgent(headers);
    expect(userAgent).toBe('TestBot/1.0');
  });

  test('should extract from SvelteKit-style headers', () => {
    const headers = {
      get: (name: string) => name === 'user-agent' ? 'TestBot/1.0' : null
    };
    const userAgent = extractUserAgent(headers);
    expect(userAgent).toBe('TestBot/1.0');
  });

  test('should handle missing user agent', () => {
    const headers = {};
    const userAgent = extractUserAgent(headers);
    expect(userAgent).toBe('');
  });
});
130  src/core.ts  Normal file
@@ -0,0 +1,130 @@
import { CrawlerConfig, CrawlerDetectionResult, HeadersLike } from './types.js';
import { AI_CRAWLER_PATTERNS, SEO_CRAWLER_PATTERNS, matchesPatterns, detectCrawlerType } from './crawlers.js';

/**
 * Default configuration for crawler blocking
 */
export const DEFAULT_CONFIG: Required<CrawlerConfig> = {
  blockAI: true,
  blockSEO: false,
  message: 'Access denied',
  statusCode: 403,
  customBlocked: [],
  whitelist: [],
  headers: {},
  debug: false,
};

/**
 * Merge user config with default config
 */
export function mergeConfig(userConfig: CrawlerConfig = {}): Required<CrawlerConfig> {
  return {
    ...DEFAULT_CONFIG,
    ...userConfig,
    customBlocked: [...DEFAULT_CONFIG.customBlocked, ...(userConfig.customBlocked || [])],
    whitelist: [...DEFAULT_CONFIG.whitelist, ...(userConfig.whitelist || [])],
    headers: { ...DEFAULT_CONFIG.headers, ...(userConfig.headers || {}) },
  };
}

/**
 * Log debug information if debug mode is enabled
 */
function debug(config: Required<CrawlerConfig>, message: string): void {
  if (config.debug) {
    // eslint-disable-next-line no-console
    console.log(`[crawl-me-not] ${message}`);
  }
}

/**
 * Check if a user agent should be blocked based on the configuration
 */
export function shouldBlockCrawler(userAgent: string, config: CrawlerConfig = {}): CrawlerDetectionResult {
  const mergedConfig = mergeConfig(config);

  // Default result
  const result: CrawlerDetectionResult = {
    isBlocked: false,
    crawlerType: null,
    userAgent,
  };

  debug(mergedConfig, `Checking user agent: ${userAgent}`);

  // Check whitelist first (takes precedence)
  const whitelistMatch = matchesPatterns(userAgent, mergedConfig.whitelist);
  if (whitelistMatch.match) {
    debug(mergedConfig, `User agent whitelisted by pattern: ${String(whitelistMatch.pattern)}`);
    return result;
  }

  // Check custom blocked patterns
  const customMatch = matchesPatterns(userAgent, mergedConfig.customBlocked);
  if (customMatch.match && customMatch.pattern) {
    debug(mergedConfig, `User agent blocked by custom pattern: ${String(customMatch.pattern)}`);
    return {
      ...result,
      isBlocked: true,
      crawlerType: 'custom',
      matchedPattern: customMatch.pattern,
    };
  }

  // Detect crawler type
  const crawlerType = detectCrawlerType(userAgent);

  if (crawlerType === 'ai' && mergedConfig.blockAI) {
    const aiMatch = matchesPatterns(userAgent, AI_CRAWLER_PATTERNS);
    if (aiMatch.pattern) {
      debug(mergedConfig, `AI crawler blocked: ${String(aiMatch.pattern)}`);
      return {
        ...result,
        isBlocked: true,
        crawlerType: 'ai',
        matchedPattern: aiMatch.pattern,
      };
    }
  }

  if (crawlerType === 'seo' && mergedConfig.blockSEO) {
    const seoMatch = matchesPatterns(userAgent, SEO_CRAWLER_PATTERNS);
    if (seoMatch.pattern) {
      debug(mergedConfig, `SEO crawler blocked: ${String(seoMatch.pattern)}`);
      return {
        ...result,
        isBlocked: true,
        crawlerType: 'seo',
        matchedPattern: seoMatch.pattern,
      };
    }
  }

  debug(mergedConfig, 'User agent allowed');
  return {
    ...result,
    crawlerType,
  };
}

/**
 * Extract user agent from various header formats
 */
export function extractUserAgent(headers: HeadersLike): string {
  // Handle Headers-like object (Web API, SvelteKit, etc.)
  if (typeof (headers as { get(name: string): string | null }).get === 'function') {
    const headersObj = headers as { get(name: string): string | null };
    return headersObj.get('user-agent') || headersObj.get('User-Agent') || '';
  }

  // Handle regular object (Express, Node.js, etc.)
  const headersObj = headers as Record<string, string | string[] | undefined>;
  const userAgent = headersObj['user-agent'] || headersObj['User-Agent'];

  if (Array.isArray(userAgent)) {
    return userAgent[0] || '';
  }

  return userAgent || '';
}
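For context, a minimal usage sketch (not part of this commit) of how the core.ts exports could sit in a plain Node.js request handler. The node:http scaffolding and the /EvilScraper/i pattern are illustrative assumptions; only shouldBlockCrawler, extractUserAgent, and the config/result fields come from the code above.

// Hypothetical usage sketch, not included in this commit.
import { createServer } from 'node:http';
import { shouldBlockCrawler, extractUserAgent } from './core.js';

const server = createServer((req, res) => {
  // req.headers is a Node/Express-style Record<string, string | string[] | undefined>
  const userAgent = extractUserAgent(req.headers);
  const result = shouldBlockCrawler(userAgent, {
    blockAI: true,                   // the default, shown for clarity
    customBlocked: [/EvilScraper/i], // illustrative extra pattern
  });

  if (result.isBlocked) {
    // Mirrors the library defaults: status 403 and 'Access denied'
    res.writeHead(403, { 'Content-Type': 'text/plain' });
    res.end('Access denied');
    return;
  }

  res.writeHead(200, { 'Content-Type': 'text/plain' });
  res.end('ok');
});

server.listen(3000);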
160  src/crawlers.ts  Normal file
@@ -0,0 +1,160 @@
/**
 * Known AI crawler user agent patterns
 * These are patterns for bots that are primarily used for AI training, data collection, or content scraping
 */
export const AI_CRAWLER_PATTERNS: (string | RegExp)[] = [
  // OpenAI
  /GPTBot/i,
  /ChatGPT-User/i,

  // Google AI
  /Google-Extended/i,
  /GoogleOther/i,

  // Anthropic
  /Claude-Web/i,
  /ClaudeBot/i,

  // Meta/Facebook AI
  /FacebookBot/i,
  /Meta-ExternalAgent/i,

  // Bytedance/TikTok
  /Bytespider/i,
  /ByteDance/i,

  // Common AI/ML crawlers
  /CCBot/i,
  /anthropic-ai/i,
  /PerplexityBot/i,
  /YouBot/i,
  /ChatGPT/i,
  /GPT/i,
  /OpenAI/i,
  /AI2Bot/i,
  /cohere-ai/i,

  // Academic/Research crawlers often used for AI training
  /ArchiveBot/i,
  /Internet Archive/i,
  /archive\.org/i,

  // Content scrapers and data collectors
  /DataForSeoBot/i,
  /SemrushBot/i,
  /AhrefsBot/i,
  /MJ12bot/i,
  /DotBot/i,
  /CommonCrawl/i,
  /webzio/i,
  /Scrapy/i,
  /scrapy/i,
  /python-requests/i,
  /python-urllib/i,
  /curl/i,
  /wget/i,
  /HTTPie/i,
  /Postman/i,
  /Insomnia/i,

  // Generic AI/bot patterns
  /bot.*ai/i,
  /ai.*bot/i,
  /crawler/i,
  /scraper/i,
  /spider/i,
];

/**
 * Known SEO crawler user agent patterns
 * These are legitimate crawlers used for SEO analysis and website monitoring
 */
export const SEO_CRAWLER_PATTERNS: (string | RegExp)[] = [
  // Google SEO tools
  /Googlebot/i,
  /Google-Site-Verification/i,
  /Google-InspectionTool/i,

  // Bing
  /Bingbot/i,
  /BingPreview/i,
  /msnbot/i,

  // Yandex
  /YandexBot/i,
  /YandexImages/i,
  /YandexMetrika/i,

  // Baidu
  /Baiduspider/i,
  /Baidu/i,

  // DuckDuckGo
  /DuckDuckBot/i,
  /DuckDuckGo/i,

  // SEO tools
  /AhrefsBot/i,
  /SemrushBot/i,
  /MJ12bot/i,
  /DotBot/i,
  /MegaIndex/i,
  /BacklinkCrawler/i,
  /SEOkicks/i,
  /sistrix/i,
  /BLEXBot/i,

  // Social media crawlers
  /facebookexternalhit/i,
  /Twitterbot/i,
  /LinkedInBot/i,
  /WhatsApp/i,
  /TelegramBot/i,
  /SkypeUriPreview/i,
  /Slackbot/i,
  /Discordbot/i,

  // Other search engines
  /Yahoo/i,
  /Slurp/i,
  /Ask Jeeves/i,
  /Teoma/i,
  /ia_archiver/i,
  /Wayback/i,
];

/**
 * Check if a user agent matches any pattern in the given list
 */
export function matchesPatterns(userAgent: string, patterns: (string | RegExp)[]): { match: boolean; pattern?: string | RegExp } {
  const lowerUserAgent = userAgent.toLowerCase();

  for (const pattern of patterns) {
    if (pattern instanceof RegExp) {
      if (pattern.test(userAgent)) {
        return { match: true, pattern };
      }
    } else if (lowerUserAgent.includes(pattern.toLowerCase())) {
      return { match: true, pattern };
    }
  }

  return { match: false };
}

/**
 * Detect the type of crawler based on user agent
 */
export function detectCrawlerType(userAgent: string): 'ai' | 'seo' | null {
  const aiMatch = matchesPatterns(userAgent, AI_CRAWLER_PATTERNS);
  if (aiMatch.match) {
    return 'ai';
  }

  const seoMatch = matchesPatterns(userAgent, SEO_CRAWLER_PATTERNS);
  if (seoMatch.match) {
    return 'seo';
  }

  return null;
}
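A quick behavioural sketch (not part of the commit) of the helpers exported above. The sample user agents come from the test file; the expected outputs follow from the pattern lists, noting that AI patterns are checked before SEO patterns.

// Illustration only, not included in this commit.
import { detectCrawlerType, matchesPatterns, AI_CRAWLER_PATTERNS } from './crawlers.js';

console.log(detectCrawlerType('GPTBot/1.0'));    // 'ai'  (matches /GPTBot/i)
console.log(detectCrawlerType('Googlebot/2.1')); // 'seo' (matches /Googlebot/i)
console.log(detectCrawlerType('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36')); // null

// matchesPatterns also reports which pattern fired; core.ts surfaces this as matchedPattern.
const { match, pattern } = matchesPatterns('ClaudeBot/1.0', AI_CRAWLER_PATTERNS);
console.log(match, String(pattern)); // true /ClaudeBot/i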
24  src/index.ts  Normal file
@@ -0,0 +1,24 @@
// Core functionality
export {
  shouldBlockCrawler,
  extractUserAgent,
  mergeConfig,
  DEFAULT_CONFIG,
} from './core.js';

// Crawler patterns and detection
export {
  AI_CRAWLER_PATTERNS,
  SEO_CRAWLER_PATTERNS,
  matchesPatterns,
  detectCrawlerType,
} from './crawlers.js';

// Types
export type {
  CrawlerConfig,
  CrawlerDetectionResult,
} from './types.js';

// Default export for convenience
export { shouldBlockCrawler as default } from './core.js';
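Consumer-side sketch (again, not part of the commit): the barrel file exposes shouldBlockCrawler both as a named export and as the default export, so either import style below resolves to the same function.

// Hypothetical consumer code, not included in this commit.
import shouldBlockCrawler, { extractUserAgent, DEFAULT_CONFIG } from './index.js';

const ua = extractUserAgent({ 'user-agent': 'GPTBot/1.0' });
const verdict = shouldBlockCrawler(ua); // falls back to DEFAULT_CONFIG via mergeConfig
console.log(verdict.isBlocked, verdict.crawlerType, DEFAULT_CONFIG.statusCode); // true 'ai' 403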
30  src/types.ts  Normal file
@@ -0,0 +1,30 @@
export interface CrawlerConfig {
  /** Block AI crawlers (default: true) */
  blockAI?: boolean;
  /** Block SEO crawlers (default: false) */
  blockSEO?: boolean;
  /** Custom response message when blocking crawlers (default: "Access denied") */
  message?: string;
  /** HTTP status code to return when blocking (default: 403) */
  statusCode?: number;
  /** Additional user agents to block (regex patterns) */
  customBlocked?: (string | RegExp)[];
  /** User agents to always allow (takes precedence over blocking) */
  whitelist?: (string | RegExp)[];
  /** Custom response headers to set when blocking */
  headers?: Record<string, string>;
  /** Enable debug logging (default: false) */
  debug?: boolean;
}

export interface CrawlerDetectionResult {
  isBlocked: boolean;
  crawlerType: 'ai' | 'seo' | 'custom' | null;
  userAgent: string;
  matchedPattern?: string | RegExp;
}

// Header types for different environments
export type HeadersLike =
  | Record<string, string | string[] | undefined> // Express-style
  | { get(name: string): string | null }; // Web API Headers-style