init commit

commit d13ea65209, 2025-05-31 12:58:55 -04:00
18 changed files with 6436 additions and 0 deletions

src/core.test.ts (new file, 58 lines)

@@ -0,0 +1,58 @@
import { shouldBlockCrawler, extractUserAgent } from './core.js';

describe('shouldBlockCrawler', () => {
  test('should block AI crawlers by default', () => {
    const result = shouldBlockCrawler('GPTBot/1.0');
    expect(result.isBlocked).toBe(true);
    expect(result.crawlerType).toBe('ai');
  });

  test('should not block SEO crawlers by default', () => {
    const result = shouldBlockCrawler('Googlebot/2.1');
    expect(result.isBlocked).toBe(false);
    expect(result.crawlerType).toBe('seo');
  });

  test('should respect whitelist', () => {
    const result = shouldBlockCrawler('GPTBot/1.0', {
      whitelist: [/GPTBot/i]
    });
    expect(result.isBlocked).toBe(false);
  });

  test('should block custom patterns', () => {
    const result = shouldBlockCrawler('CustomBot/1.0', {
      customBlocked: [/CustomBot/i]
    });
    expect(result.isBlocked).toBe(true);
    expect(result.crawlerType).toBe('custom');
  });

  test('should allow regular browsers', () => {
    const result = shouldBlockCrawler('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36');
    expect(result.isBlocked).toBe(false);
    expect(result.crawlerType).toBe(null);
  });
});

describe('extractUserAgent', () => {
  test('should extract from Express-style headers', () => {
    const headers = { 'user-agent': 'TestBot/1.0' };
    const userAgent = extractUserAgent(headers);
    expect(userAgent).toBe('TestBot/1.0');
  });

  test('should extract from SvelteKit-style headers', () => {
    const headers = {
      get: (name: string) => name === 'user-agent' ? 'TestBot/1.0' : null
    };
    const userAgent = extractUserAgent(headers);
    expect(userAgent).toBe('TestBot/1.0');
  });

  test('should handle missing user agent', () => {
    const headers = {};
    const userAgent = extractUserAgent(headers);
    expect(userAgent).toBe('');
  });
});

src/core.ts (new file, 130 lines)

@@ -0,0 +1,130 @@
import { CrawlerConfig, CrawlerDetectionResult, HeadersLike } from './types.js';
import { AI_CRAWLER_PATTERNS, SEO_CRAWLER_PATTERNS, matchesPatterns, detectCrawlerType } from './crawlers.js';

/**
 * Default configuration for crawler blocking
 */
export const DEFAULT_CONFIG: Required<CrawlerConfig> = {
  blockAI: true,
  blockSEO: false,
  message: 'Access denied',
  statusCode: 403,
  customBlocked: [],
  whitelist: [],
  headers: {},
  debug: false,
};

/**
 * Merge user config with default config
 */
export function mergeConfig(userConfig: CrawlerConfig = {}): Required<CrawlerConfig> {
  return {
    ...DEFAULT_CONFIG,
    ...userConfig,
    customBlocked: [...DEFAULT_CONFIG.customBlocked, ...(userConfig.customBlocked || [])],
    whitelist: [...DEFAULT_CONFIG.whitelist, ...(userConfig.whitelist || [])],
    headers: { ...DEFAULT_CONFIG.headers, ...(userConfig.headers || {}) },
  };
}

/**
 * Log debug information if debug mode is enabled
 */
function debug(config: Required<CrawlerConfig>, message: string): void {
  if (config.debug) {
    // eslint-disable-next-line no-console
    console.log(`[crawl-me-not] ${message}`);
  }
}

/**
 * Check if a user agent should be blocked based on the configuration
 */
export function shouldBlockCrawler(userAgent: string, config: CrawlerConfig = {}): CrawlerDetectionResult {
  const mergedConfig = mergeConfig(config);

  // Default result
  const result: CrawlerDetectionResult = {
    isBlocked: false,
    crawlerType: null,
    userAgent,
  };

  debug(mergedConfig, `Checking user agent: ${userAgent}`);

  // Check whitelist first (takes precedence)
  const whitelistMatch = matchesPatterns(userAgent, mergedConfig.whitelist);
  if (whitelistMatch.match) {
    debug(mergedConfig, `User agent whitelisted by pattern: ${String(whitelistMatch.pattern)}`);
    return result;
  }

  // Check custom blocked patterns
  const customMatch = matchesPatterns(userAgent, mergedConfig.customBlocked);
  if (customMatch.match && customMatch.pattern) {
    debug(mergedConfig, `User agent blocked by custom pattern: ${String(customMatch.pattern)}`);
    return {
      ...result,
      isBlocked: true,
      crawlerType: 'custom',
      matchedPattern: customMatch.pattern,
    };
  }

  // Detect crawler type
  const crawlerType = detectCrawlerType(userAgent);

  if (crawlerType === 'ai' && mergedConfig.blockAI) {
    const aiMatch = matchesPatterns(userAgent, AI_CRAWLER_PATTERNS);
    if (aiMatch.pattern) {
      debug(mergedConfig, `AI crawler blocked: ${String(aiMatch.pattern)}`);
      return {
        ...result,
        isBlocked: true,
        crawlerType: 'ai',
        matchedPattern: aiMatch.pattern,
      };
    }
  }

  if (crawlerType === 'seo' && mergedConfig.blockSEO) {
    const seoMatch = matchesPatterns(userAgent, SEO_CRAWLER_PATTERNS);
    if (seoMatch.pattern) {
      debug(mergedConfig, `SEO crawler blocked: ${String(seoMatch.pattern)}`);
      return {
        ...result,
        isBlocked: true,
        crawlerType: 'seo',
        matchedPattern: seoMatch.pattern,
      };
    }
  }

  debug(mergedConfig, 'User agent allowed');
  return {
    ...result,
    crawlerType,
  };
}

/**
 * Extract user agent from various header formats
 */
export function extractUserAgent(headers: HeadersLike): string {
  // Handle Headers-like object (Web API, SvelteKit, etc.)
  if (typeof (headers as { get(name: string): string | null }).get === 'function') {
    const headersObj = headers as { get(name: string): string | null };
    return headersObj.get('user-agent') || headersObj.get('User-Agent') || '';
  }

  // Handle regular object (Express, Node.js, etc.)
  const headersObj = headers as Record<string, string | string[] | undefined>;
  const userAgent = headersObj['user-agent'] || headersObj['User-Agent'];

  if (Array.isArray(userAgent)) {
    return userAgent[0] || '';
  }

  return userAgent || '';
}
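
A minimal usage sketch (not part of this commit) of how the two exported helpers compose in a request handler; the handleRequest shape and its return value are hypothetical stand-ins for whatever framework the caller uses:

import { shouldBlockCrawler, extractUserAgent, mergeConfig } from './core.js';
import { HeadersLike } from './types.js';

// Hypothetical handler: returns a block response, or null to let the request through.
function handleRequest(headers: HeadersLike): { status: number; body: string } | null {
  const userAgent = extractUserAgent(headers);
  const result = shouldBlockCrawler(userAgent, { blockAI: true, blockSEO: false });
  if (result.isBlocked) {
    const config = mergeConfig({ blockAI: true, blockSEO: false });
    return { status: config.statusCode, body: config.message }; // 403 / 'Access denied' by default
  }
  return null;
}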

src/crawlers.ts (new file, 160 lines)

@@ -0,0 +1,160 @@
/**
 * Known AI crawler user agent patterns
 * These are patterns for bots that are primarily used for AI training, data collection, or content scraping
 */
export const AI_CRAWLER_PATTERNS: (string | RegExp)[] = [
  // OpenAI
  /GPTBot/i,
  /ChatGPT-User/i,

  // Google AI
  /Google-Extended/i,
  /GoogleOther/i,

  // Anthropic
  /Claude-Web/i,
  /ClaudeBot/i,

  // Meta/Facebook AI
  /FacebookBot/i,
  /Meta-ExternalAgent/i,

  // Bytedance/TikTok
  /Bytespider/i,
  /ByteDance/i,

  // Common AI/ML crawlers
  /CCBot/i,
  /anthropic-ai/i,
  /PerplexityBot/i,
  /YouBot/i,
  /ChatGPT/i,
  /GPT/i,
  /OpenAI/i,
  /AI2Bot/i,
  /cohere-ai/i,

  // Academic/Research crawlers often used for AI training
  /ArchiveBot/i,
  /Internet Archive/i,
  /archive\.org/i,

  // Content scrapers and data collectors
  /DataForSeoBot/i,
  /SemrushBot/i,
  /AhrefsBot/i,
  /MJ12bot/i,
  /DotBot/i,
  /CommonCrawl/i,
  /webzio/i,
  /Scrapy/i,
  /scrapy/i,
  /python-requests/i,
  /python-urllib/i,
  /curl/i,
  /wget/i,
  /HTTPie/i,
  /Postman/i,
  /Insomnia/i,

  // Generic AI/bot patterns
  /bot.*ai/i,
  /ai.*bot/i,
  /crawler/i,
  /scraper/i,
  /spider/i,
];

/**
 * Known SEO crawler user agent patterns
 * These are legitimate crawlers used for SEO analysis and website monitoring
 */
export const SEO_CRAWLER_PATTERNS: (string | RegExp)[] = [
  // Google SEO tools
  /Googlebot/i,
  /Google-Site-Verification/i,
  /Google-InspectionTool/i,

  // Bing
  /Bingbot/i,
  /BingPreview/i,
  /msnbot/i,

  // Yandex
  /YandexBot/i,
  /YandexImages/i,
  /YandexMetrika/i,

  // Baidu
  /Baiduspider/i,
  /Baidu/i,

  // DuckDuckGo
  /DuckDuckBot/i,
  /DuckDuckGo/i,

  // SEO tools
  /AhrefsBot/i,
  /SemrushBot/i,
  /MJ12bot/i,
  /DotBot/i,
  /MegaIndex/i,
  /BacklinkCrawler/i,
  /SEOkicks/i,
  /sistrix/i,
  /BLEXBot/i,

  // Social media crawlers
  /facebookexternalhit/i,
  /Twitterbot/i,
  /LinkedInBot/i,
  /WhatsApp/i,
  /TelegramBot/i,
  /SkypeUriPreview/i,
  /Slackbot/i,
  /Discordbot/i,

  // Other search engines
  /Yahoo/i,
  /Slurp/i,
  /Ask Jeeves/i,
  /Teoma/i,
  /ia_archiver/i,
  /Wayback/i,
];

/**
 * Check if a user agent matches any pattern in the given list
 */
export function matchesPatterns(userAgent: string, patterns: (string | RegExp)[]): { match: boolean; pattern?: string | RegExp } {
  const lowerUserAgent = userAgent.toLowerCase();

  for (const pattern of patterns) {
    if (pattern instanceof RegExp) {
      if (pattern.test(userAgent)) {
        return { match: true, pattern };
      }
    } else if (lowerUserAgent.includes(pattern.toLowerCase())) {
      return { match: true, pattern };
    }
  }

  return { match: false };
}

/**
 * Detect the type of crawler based on user agent
 */
export function detectCrawlerType(userAgent: string): 'ai' | 'seo' | null {
  const aiMatch = matchesPatterns(userAgent, AI_CRAWLER_PATTERNS);
  if (aiMatch.match) {
    return 'ai';
  }

  const seoMatch = matchesPatterns(userAgent, SEO_CRAWLER_PATTERNS);
  if (seoMatch.match) {
    return 'seo';
  }

  return null;
}
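
For orientation (illustrative, not part of the diff), how the two helpers behave on a few made-up user agents; note that detectCrawlerType consults AI_CRAWLER_PATTERNS before SEO_CRAWLER_PATTERNS, so a user agent listed in both arrays (e.g. AhrefsBot) is reported as 'ai':

import { detectCrawlerType, matchesPatterns, AI_CRAWLER_PATTERNS } from './crawlers.js';

console.log(detectCrawlerType('GPTBot/1.0'));     // 'ai'
console.log(detectCrawlerType('Googlebot/2.1'));  // 'seo'
console.log(detectCrawlerType('Mozilla/5.0 (Windows NT 10.0; Win64; x64)')); // null

// matchesPatterns also reports which pattern produced the match.
console.log(matchesPatterns('GPTBot/1.0', AI_CRAWLER_PATTERNS)); // { match: true, pattern: /GPTBot/i }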

src/index.ts (new file, 24 lines)

@@ -0,0 +1,24 @@
// Core functionality
export {
  shouldBlockCrawler,
  extractUserAgent,
  mergeConfig,
  DEFAULT_CONFIG,
} from './core.js';

// Crawler patterns and detection
export {
  AI_CRAWLER_PATTERNS,
  SEO_CRAWLER_PATTERNS,
  matchesPatterns,
  detectCrawlerType,
} from './crawlers.js';

// Types
export type {
  CrawlerConfig,
  CrawlerDetectionResult,
} from './types.js';

// Default export for convenience
export { shouldBlockCrawler as default } from './core.js';
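
Because of the default export, consumers can use either import style; the bare 'crawl-me-not' specifier is assumed from the debug log prefix, since no package.json appears in this excerpt:

import shouldBlockCrawler from 'crawl-me-not';
// ...or, equivalently: import { shouldBlockCrawler } from 'crawl-me-not';

const verdict = shouldBlockCrawler('CCBot/2.0');
console.log(verdict.isBlocked, verdict.crawlerType); // true 'ai'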

src/types.ts (new file, 30 lines)

@@ -0,0 +1,30 @@
export interface CrawlerConfig {
  /** Block AI crawlers (default: true) */
  blockAI?: boolean;
  /** Block SEO crawlers (default: false) */
  blockSEO?: boolean;
  /** Custom response message when blocking crawlers (default: "Access denied") */
  message?: string;
  /** HTTP status code to return when blocking (default: 403) */
  statusCode?: number;
  /** Additional user agents to block (string or regex patterns) */
  customBlocked?: (string | RegExp)[];
  /** User agents to always allow (takes precedence over blocking) */
  whitelist?: (string | RegExp)[];
  /** Custom response headers to set when blocking */
  headers?: Record<string, string>;
  /** Enable debug logging (default: false) */
  debug?: boolean;
}

export interface CrawlerDetectionResult {
  isBlocked: boolean;
  crawlerType: 'ai' | 'seo' | 'custom' | null;
  userAgent: string;
  matchedPattern?: string | RegExp;
}

// Header types for different environments
export type HeadersLike =
  | Record<string, string | string[] | undefined> // Express-style
  | { get(name: string): string | null }; // Web API Headers-style
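
For reference, a CrawlerConfig literal exercising most of the optional fields (the values are illustrative, not recommendations):

import type { CrawlerConfig } from './types.js';

const config: CrawlerConfig = {
  blockAI: true,                             // the default
  blockSEO: true,                            // opt in to blocking SEO crawlers too
  customBlocked: [/BadBot/i, 'evilscraper'], // strings are matched as case-insensitive substrings
  whitelist: [/GoodBot/i],                   // always allowed; wins over every block rule
  message: 'No crawling, please',
  statusCode: 429,
  headers: { 'X-Robots-Blocked': 'true' },
  debug: true,
};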