init commit

2025-05-31 12:58:55 -04:00
commit d13ea65209
18 changed files with 6436 additions and 0 deletions
--- a/.eslintrc.json
+++ b/.eslintrc.json
@@ -0,0 +1,23 @@
 {
  "env": {
    "browser": true,
    "es2021": true,
    "node": true,
    "jest": true
  },
  "extends": [
    "eslint:recommended"
  ],
  "parser": "@typescript-eslint/parser",
  "parserOptions": {
    "ecmaVersion": "latest",
    "sourceType": "module"
  },
  "plugins": ["@typescript-eslint"],
  "rules": {
    "@typescript-eslint/no-unused-vars": "error",
    "prefer-const": "error",
    "no-var": "error"
  },
  "ignorePatterns": ["dist/", "node_modules/", "*.js", "examples/", "**/*.test.ts", "**/*.spec.ts"]
 } 
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,59 @@
 # Dependencies
 node_modules/
 npm-debug.log*
 yarn-debug.log*
 yarn-error.log*
 # Build output
 dist/
 build/
 *.tsbuildinfo
 # Coverage directory used by tools like istanbul
 coverage/
 *.lcov
 # Runtime data
 pids
 *.pid
 *.seed
 *.pid.lock
 # Logs
 logs
 *.log
 # Environment variables
 .env
 .env.local
 .env.development.local
 .env.test.local
 .env.production.local
 # Editor directories and files
 .vscode/
 .idea/
 *.swp
 *.swo
 *~
 # OS generated files
 .DS_Store
 .DS_Store?
 ._*
 .Spotlight-V100
 .Trashes
 ehthumbs.db
 Thumbs.db
 # Temporary folders
 tmp/
 temp/
 # Testing
 .nyc_output
 # Package manager lock files (uncomment if you want to include them)
 # package-lock.json
 # yarn.lock
 # pnpm-lock.yaml 
--- a/.tool-versions
+++ b/.tool-versions
@@ -0,0 +1 @@
 nodejs 22.16.0
--- a/21
+++ b/21
@@ -0,0 +1,21 @@
 MIT License
 Copyright (c) 2024 crawl-me-not contributors
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 in the Software without restriction, including without limitation the rights
 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 copies of the Software, and to permit persons to whom the Software is
 furnished to do so, subject to the following conditions:
 The above copyright notice and this permission notice shall be included in all
 copies or substantial portions of the Software.
 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE. 
--- a/README.md
+++ b/README.md
@@ -0,0 +1,355 @@
 # crawl-me-not 🚫🤖
 A lightweight, framework-agnostic library to detect and block AI crawlers and SEO crawlers from any web server or framework.
 ## Features
 - 🚫 **Block AI Crawlers**: Detect 43+ AI training bots like GPTBot, ChatGPT-User, Claude-Web, and more
 - 🔍 **Optional SEO Blocking**: Also detect SEO crawlers when needed
 - 🎯 **Framework Agnostic**: Works with Express, SvelteKit, Next.js, Fastify, vanilla Node.js, and more
 - 🛠️ **Highly Configurable**: Custom patterns, whitelists, response messages, and headers
 - 📝 **TypeScript**: Full TypeScript support with detailed type definitions
 - 🪶 **Zero Dependencies**: Lightweight with no external dependencies
 - 🧪 **Well Tested**: Comprehensive test coverage
 ## Installation
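 ```bash
 npm install @silentsilas/crawl-me-not
 ```
 > The package name declared in `package.json` is scoped, so import from `'@silentsilas/crawl-me-not'` wherever the examples below write `'crawl-me-not'`.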
 ## Quick Start
 ```typescript
 import { shouldBlockCrawler, extractUserAgent } from 'crawl-me-not';
 // Basic usage
 const userAgent = extractUserAgent(request.headers);
 const result = shouldBlockCrawler(userAgent);
 if (result.isBlocked) {
  // Send 403 response
  return new Response('Access denied', { status: 403 });
 }
 // Continue with normal request handling
 ```
 ## Configuration Options
 ```typescript
 interface CrawlerConfig {
  blockAI?: boolean;           // Block AI crawlers (default: true)
  blockSEO?: boolean;          // Block SEO crawlers (default: false)
  message?: string;            // Custom response message (default: "Access denied")
  statusCode?: number;         // HTTP status code (default: 403)
  customBlocked?: (string | RegExp)[]; // Additional patterns to block
  whitelist?: (string | RegExp)[];     // Patterns to always allow
  headers?: Record<string, string>;    // Custom response headers
  debug?: boolean;             // Enable debug logging (default: false)
 }
 ```
 ## Framework Examples
 ### Express
 ```typescript
 import express from 'express';
 import { shouldBlockCrawler, extractUserAgent } from 'crawl-me-not';
 const app = express();
 app.use((req, res, next) => {
  const userAgent = extractUserAgent(req.headers);
  const result = shouldBlockCrawler(userAgent, {
    blockAI: true,
    blockSEO: false,
    debug: true
  });
  if (result.isBlocked) {
    return res.status(403).json({
      error: 'Access denied',
      reason: `${result.crawlerType} crawler detected`,
      userAgent: result.userAgent
    });
  }
  next();
 });
 ```
 ### SvelteKit
 ```typescript
 // src/hooks.server.ts
 import { shouldBlockCrawler, extractUserAgent } from 'crawl-me-not';
 import type { Handle } from '@sveltejs/kit';
 export const handle: Handle = async ({ event, resolve }) => {
  const userAgent = extractUserAgent(event.request.headers);
  const result = shouldBlockCrawler(userAgent);
  if (result.isBlocked) {
    return new Response('Access denied', {
      status: 403,
      headers: { 'X-Blocked-Reason': 'AI crawler detected' }
    });
  }
  return resolve(event);
 };
 ```
 ### Next.js (App Router)
 ```typescript
 // middleware.ts
 import { shouldBlockCrawler, extractUserAgent } from 'crawl-me-not';
 import { NextRequest, NextResponse } from 'next/server';
 export function middleware(request: NextRequest) {
  const userAgent = extractUserAgent(request.headers);
  const result = shouldBlockCrawler(userAgent);
  if (result.isBlocked) {
    return NextResponse.json(
      { error: 'Access denied' },
      { status: 403 }
    );
  }
  return NextResponse.next();
 }
 ```
 ### Next.js (Pages Router)
 ```typescript
 // pages/api/[...all].ts
 import { shouldBlockCrawler, extractUserAgent } from 'crawl-me-not';
 import type { NextApiRequest, NextApiResponse } from 'next';
 export default function handler(req: NextApiRequest, res: NextApiResponse) {
  const userAgent = extractUserAgent(req.headers);
  const result = shouldBlockCrawler(userAgent);
  if (result.isBlocked) {
    return res.status(403).json({ error: 'Access denied' });
  }
  // Continue with your API logic
  res.status(200).json({ message: 'Hello World' });
 }
 ```
 ### Fastify
 ```typescript
 import Fastify from 'fastify';
 import { shouldBlockCrawler, extractUserAgent } from 'crawl-me-not';
 const fastify = Fastify();
 fastify.addHook('preHandler', async (request, reply) => {
  const userAgent = extractUserAgent(request.headers);
  const result = shouldBlockCrawler(userAgent);
  if (result.isBlocked) {
    reply.status(403).send({ error: 'Access denied' });
    return reply; // async hooks must return the reply after sending, otherwise the route handler still runs
  }
 });
 ```
 ### Vanilla Node.js
 ```typescript
 import http from 'node:http';
 import { shouldBlockCrawler, extractUserAgent } from 'crawl-me-not';
 const server = http.createServer((req, res) => {
  const userAgent = extractUserAgent(req.headers);
  const result = shouldBlockCrawler(userAgent);
  if (result.isBlocked) {
    res.statusCode = 403;
    res.setHeader('Content-Type', 'application/json');
    res.end(JSON.stringify({ error: 'Access denied' }));
    return;
  }
  // Your normal request handling
  res.statusCode = 200;
  res.end('Hello World!');
 });
 ```
 ### Bun
 ```typescript
 import { shouldBlockCrawler, extractUserAgent } from 'crawl-me-not';
 Bun.serve({
  fetch(request) {
    const userAgent = extractUserAgent(request.headers);
    const result = shouldBlockCrawler(userAgent);
    if (result.isBlocked) {
      return new Response('Access denied', { status: 403 });
    }
    return new Response('Hello World!');
  },
 });
 ```
 ## Advanced Usage
 ### Custom Configuration
 ```typescript
 const result = shouldBlockCrawler(userAgent, {
  blockAI: true,
  blockSEO: false,
  customBlocked: [
    /badbot/i,           // Block anything with "badbot"
    'unwanted-crawler',  // Block any user agent containing this string (case-insensitive)
    /scraper.*v[0-9]/i   // Block scraper versions
  ],
  whitelist: [
    /goodbot/i,          // Always allow "goodbot"
    'monitoring-service' // Always allow this service
  ],
  message: 'Custom blocking message',
  statusCode: 429,
  headers: {
    'X-Blocked-Reason': 'Automated traffic detected',
    'Retry-After': '3600'
  },
  debug: true
 });
 ```
 ### Manual Detection (Non-blocking)
 ```typescript
 import { shouldBlockCrawler, extractUserAgent } from 'crawl-me-not';
 // Just detect, don't block
 const userAgent = extractUserAgent(request.headers);
 const result = shouldBlockCrawler(userAgent, { blockAI: false, blockSEO: false });
 // Log crawler activity
 if (result.crawlerType) {
  console.log(`Detected ${result.crawlerType} crawler:`, result.userAgent);
 }
 // Apply custom logic
 if (result.crawlerType === 'ai' && isRateLimited(request)) {
  return blockResponse();
 }
 ```
 ### Rate Limiting for Crawlers
 ```typescript
 const crawlerLimits = new Map();
 app.use((req, res, next) => {
  const userAgent = extractUserAgent(req.headers);
  const result = shouldBlockCrawler(userAgent, { blockAI: false });
  if (result.crawlerType === 'ai') {
    const ip = req.ip;
    const now = Date.now();
    const limit = crawlerLimits.get(ip) || { count: 0, resetTime: now + 60000 };
    if (now > limit.resetTime) {
      limit.count = 0;
      limit.resetTime = now + 60000;
    }
    limit.count++;
    crawlerLimits.set(ip, limit);
    if (limit.count > 10) {
      return res.status(429).json({ error: 'Rate limit exceeded' });
    }
  }
  next();
 });
 ```
 ## Known Crawlers
 ### AI Crawlers (Detected by default)
 - **OpenAI**: GPTBot, ChatGPT-User
 - **Google AI**: Google-Extended, GoogleOther  
 - **Anthropic**: Claude-Web, ClaudeBot
 - **Meta/Facebook**: FacebookBot, Meta-ExternalAgent
 - **ByteDance**: Bytespider, ByteDance
 - **Others**: CCBot, PerplexityBot, YouBot, AI2Bot, cohere-ai
 - **Generic patterns**: python-requests, curl, wget, scrapy, etc.
 ### SEO Crawlers (Detected but allowed by default)
 - **Search Engines**: Googlebot, Bingbot, YandexBot, Baiduspider, DuckDuckBot
 - **SEO Tools**: AhrefsBot, SemrushBot, MJ12bot, DotBot
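 - **Social Media**: facebookexternalhit, Twitterbot, LinkedInBot, WhatsApp
 > **Note:** the AI list is checked first and also contains AhrefsBot, SemrushBot, MJ12bot, DotBot and the generic `spider`/`crawler` patterns, so those tools (and Baiduspider) are classified as `ai` and are blocked by default.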
 ## API Reference
 ### `shouldBlockCrawler(userAgent: string, config?: CrawlerConfig): CrawlerDetectionResult`
 Main function to check if a user agent should be blocked.
 **Returns:**
 ```typescript
 interface CrawlerDetectionResult {
  isBlocked: boolean;                    // Whether the crawler should be blocked
  crawlerType: 'ai' | 'seo' | 'custom' | null; // Type of crawler detected
  userAgent: string;                     // The original user agent string
  matchedPattern?: string | RegExp;      // Pattern that matched (if blocked)
 }
 ```
 ### `extractUserAgent(headers: HeadersLike): string`
 Utility function to extract user agent from various header formats.
 **Supports:**
 - Express-style headers: `{ 'user-agent': 'string' }`
 - Web API Headers: `headers.get('user-agent')`
 - Node.js IncomingMessage: `req.headers['user-agent']`
 ### `detectCrawlerType(userAgent: string): 'ai' | 'seo' | null`
 Detect what type of crawler the user agent represents without blocking logic.
 ### Constants
 - `AI_CRAWLER_PATTERNS`: Array of patterns for AI crawlers
 - `SEO_CRAWLER_PATTERNS`: Array of patterns for SEO crawlers  
 - `DEFAULT_CONFIG`: Default configuration object
 ## Contributing
 Contributions are welcome! Please feel free to submit issues and pull requests.
 ## License
 MIT © crawl-me-not contributors
 ## Changelog
 ### 1.0.0
 - Initial release
 - Framework-agnostic design
 - Comprehensive AI crawler detection
 - Optional SEO crawler detection
 - TypeScript support
 - Zero dependencies 
--- a/examples/express.js
+++ b/examples/express.js
@@ -0,0 +1,39 @@
 const express = require('express');
 const { shouldBlockCrawler, extractUserAgent } = require('crawl-me-not');
 const app = express();
 /**
 * Express middleware that answers blocked crawlers with a 403 JSON body
 * and passes every other request on to the next handler.
 */
 function blockCrawlers(req, res, next) {
  const verdict = shouldBlockCrawler(extractUserAgent(req.headers), {
    blockAI: true,
    blockSEO: false,
    debug: true
  });
  if (!verdict.isBlocked) {
    next();
    return;
  }
  const { crawlerType, userAgent } = verdict;
  console.log(`Blocked ${crawlerType} crawler: ${userAgent}`);
  res.status(403).json({
    error: 'Access denied',
    reason: `${crawlerType} crawler detected`,
    userAgent: userAgent
  });
 }
 // Registered before the routes so the check runs for every request
 app.use(blockCrawlers);
 // Demo routes that are only reached when the request was not blocked
 app.get('/', (req, res) => {
  res.json({ message: 'Hello World! AI crawlers are blocked.' });
 });
 app.get('/api/data', (req, res) => {
  res.json({ data: 'This API is protected from AI crawlers' });
 });
 // PORT from the environment wins; 3000 is the fallback
 const port = process.env.PORT || 3000;
 app.listen(port, () => {
  console.log(`Server running on port ${port}`);
  console.log('AI crawlers will receive a 403 response');
 }); 
--- a/examples/nextjs-middleware.ts
+++ b/examples/nextjs-middleware.ts
@@ -0,0 +1,46 @@
 import { shouldBlockCrawler, extractUserAgent } from 'crawl-me-not';
 import { NextRequest, NextResponse } from 'next/server';
 /**
 * Next.js middleware: runs the crawler check on the incoming request and
 * either returns a 403 JSON response or lets the request continue.
 */
 export function middleware(request: NextRequest) {
  const detection = shouldBlockCrawler(extractUserAgent(request.headers), {
    blockAI: true,
    blockSEO: false,
    customBlocked: [/scrapy/i, /curl/i], // Block additional patterns
    debug: true
  });
  // Guard clause: anything that is not blocked continues untouched
  if (!detection.isBlocked) {
    return NextResponse.next();
  }
  const { crawlerType, userAgent } = detection;
  console.log(`Blocked ${crawlerType} crawler: ${userAgent}`);
  const payload = {
    error: 'Access denied',
    reason: `${crawlerType} crawler detected`,
    userAgent: userAgent
  };
  const init = {
    status: 403,
    headers: {
      'X-Blocked-Reason': 'Automated traffic detected'
    }
  };
  return NextResponse.json(payload, init);
 }
 // Configure which paths to run middleware on
 // NOTE: the negative lookahead below skips every path that starts with "api",
 // so the crawler check in this file does not run for those routes.
 export const config = {
  matcher: [
    /*
     * Match all request paths except for the ones starting with:
     * - api (API routes)
     * - _next/static (static files)
     * - _next/image (image optimization files)
     * - favicon.ico (favicon file)
     */
    '/((?!api|_next/static|_next/image|favicon.ico).*)',
  ],
 }; 
--- a/examples/sveltekit-hooks.server.ts
+++ b/examples/sveltekit-hooks.server.ts
@@ -0,0 +1,32 @@
 import { shouldBlockCrawler, extractUserAgent } from 'crawl-me-not';
 import type { Handle } from '@sveltejs/kit';
 /**
 * SvelteKit server hook that rejects blocked crawlers before the route is
 * resolved. Blocked requests get a 403 whose body is real JSON, matching the
 * `Content-Type: application/json` header that is sent with it.
 */
 export const handle: Handle = async ({ event, resolve }) => {
  const userAgent = extractUserAgent(event.request.headers);
  const result = shouldBlockCrawler(userAgent, {
    blockAI: true,
    blockSEO: false,
    debug: true
  });
  if (result.isBlocked) {
    // Same wording is used for the header and the JSON body
    const reason = `${result.crawlerType} crawler detected`;
    console.log(`Blocked ${result.crawlerType} crawler: ${result.userAgent}`);
    const body = JSON.stringify({
      error: 'Access denied',
      reason,
      userAgent: result.userAgent
    });
    return new Response(body, {
      status: 403,
      headers: {
        'Content-Type': 'application/json',
        'X-Blocked-Reason': reason
      }
    });
  }
  return resolve(event);
 };
 // If you need to compose multiple handles:
 // import { sequence } from '@sveltejs/kit/hooks';
 // 
 // export const handle = sequence(
 //   crawlerBlockingHandle,
 //   // your other handles...
 // ); 
--- a/examples/vanilla-node.js
+++ b/examples/vanilla-node.js
@@ -0,0 +1,39 @@
 const http = require('node:http');
 const { shouldBlockCrawler, extractUserAgent } = require('crawl-me-not');
 // Minimal HTTP server: blocked crawlers get a 403 JSON response, everyone
 // else gets a small JSON greeting.
 const server = http.createServer((req, res) => {
  const userAgent = extractUserAgent(req.headers);
  const result = shouldBlockCrawler(userAgent, {
    blockAI: true,
    blockSEO: false,
    debug: true
  });
  if (result.isBlocked) {
    // Derive the reason from the detected type so the header and the body
    // stay in agreement if the options above are changed.
    const reason = `${result.crawlerType} crawler detected`;
    console.log(`Blocked ${result.crawlerType} crawler: ${result.userAgent}`);
    res.statusCode = 403;
    res.setHeader('Content-Type', 'application/json');
    res.setHeader('X-Blocked-Reason', reason);
    res.end(JSON.stringify({
      error: 'Access denied',
      reason,
      userAgent: result.userAgent
    }));
    return;
  }
  // Normal request handling
  res.statusCode = 200;
  res.setHeader('Content-Type', 'application/json');
  res.end(JSON.stringify({
    message: 'Hello World!',
    timestamp: new Date().toISOString()
  }));
 });
 // PORT from the environment wins; 3000 is the fallback
 const port = process.env.PORT || 3000;
 server.listen(port, () => {
  console.log(`Server running on port ${port}`);
  console.log('AI crawlers will receive a 403 response');
 }); 
--- a/jest.config.js
+++ b/jest.config.js
@@ -0,0 +1,21 @@
 // Jest configuration: TypeScript tests under src/ are compiled by ts-jest.
 module.exports = {
  preset: 'ts-jest',
  testEnvironment: 'node',
  // Only look for tests inside src/
  roots: ['<rootDir>/src'],
  testMatch: ['**/__tests__/**/*.ts', '**/?(*.)+(spec|test).ts'],
  // NOTE(review): useESM / extensionsToTreatAsEsm request ESM handling, while
  // package.json runs plain `jest` and tsconfig.json emits commonjs - confirm
  // this combination is intended.
  transform: {
    '^.+\\.ts$': ['ts-jest', {
      useESM: true,
    }],
  },
  collectCoverageFrom: [
    'src/**/*.ts',
    '!src/**/*.d.ts',
  ],
  coverageDirectory: 'coverage',
  coverageReporters: ['text', 'lcov', 'html'],
  // Strip the ".js" suffix from relative imports (e.g. './core.js' -> './core')
  // so they can resolve to the .ts sources.
  moduleNameMapper: {
    '^(\\.{1,2}/.*)\\.js$': '$1',
  },
  extensionsToTreatAsEsm: ['.ts'],
 }; 
--- a/package-lock.json
+++ b/package-lock.json
--- a/package.json
+++ b/package.json
@@ -0,0 +1,57 @@
 {
  "name": "@silentsilas/crawl-me-not",
  "version": "1.0.0",
  "description": "Detect and block AI crawlers and SEO crawlers from any web server or framework",
  "main": "dist/index.js",
  "types": "dist/index.d.ts",
  "files": [
    "dist",
    "README.md",
    "LICENSE"
  ],
  "scripts": {
    "build": "tsc",
    "dev": "tsc --watch",
    "lint": "eslint src/**/*.ts",
    "lint:fix": "eslint src/**/*.ts --fix",
    "test": "jest",
    "prepublishOnly": "npm run build",
    "clean": "rm -rf dist"
  },
  "keywords": [
    "crawler",
    "bot",
    "ai",
    "seo",
    "middleware",
    "robots",
    "scraping",
    "protection",
    "user-agent",
    "detection",
    "framework-agnostic"
  ],
  "author": "Silas",
  "license": "MIT",
  "repository": {
    "type": "git",
    "url": "git@git.silentsilas.com:silentsilas/crawl-me-not.git"
  },
  "bugs": {
    "url": "https://git.silentsilas.com/silentsilas/crawl-me-not/issues"
  },
  "homepage": "https://git.silentsilas.com/silentsilas/crawl-me-not#readme",
  "devDependencies": {
    "@types/jest": "^29.5.8",
    "@types/node": "^20.9.0",
    "@typescript-eslint/eslint-plugin": "^6.12.0",
    "@typescript-eslint/parser": "^6.12.0",
    "eslint": "^8.54.0",
    "jest": "^29.7.0",
    "ts-jest": "^29.1.1",
    "typescript": "^5.2.2"
  },
  "engines": {
    "node": ">=16.0.0"
  }
 } 
--- a/src/core.test.ts
+++ b/src/core.test.ts
@@ -0,0 +1,58 @@
 import { shouldBlockCrawler, extractUserAgent } from './core.js';
 // Blocking decisions for representative user agents and configurations
 describe('shouldBlockCrawler', () => {
  test('should block AI crawlers by default', () => {
    const { isBlocked, crawlerType } = shouldBlockCrawler('GPTBot/1.0');
    expect(isBlocked).toBe(true);
    expect(crawlerType).toBe('ai');
  });
  test('should not block SEO crawlers by default', () => {
    const { isBlocked, crawlerType } = shouldBlockCrawler('Googlebot/2.1');
    expect(isBlocked).toBe(false);
    expect(crawlerType).toBe('seo');
  });
  test('should respect whitelist', () => {
    const options = { whitelist: [/GPTBot/i] };
    expect(shouldBlockCrawler('GPTBot/1.0', options).isBlocked).toBe(false);
  });
  test('should block custom patterns', () => {
    const options = { customBlocked: [/CustomBot/i] };
    const { isBlocked, crawlerType } = shouldBlockCrawler('CustomBot/1.0', options);
    expect(isBlocked).toBe(true);
    expect(crawlerType).toBe('custom');
  });
  test('should allow regular browsers', () => {
    const browserAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36';
    const { isBlocked, crawlerType } = shouldBlockCrawler(browserAgent);
    expect(isBlocked).toBe(false);
    expect(crawlerType).toBe(null);
  });
 });
 // Header shapes accepted by extractUserAgent
 describe('extractUserAgent', () => {
  test('should extract from Express-style headers', () => {
    expect(extractUserAgent({ 'user-agent': 'TestBot/1.0' })).toBe('TestBot/1.0');
  });
  test('should extract from SvelteKit-style headers', () => {
    // Minimal stand-in for a Headers object: only get() is provided
    const lookup = (name: string) => (name === 'user-agent' ? 'TestBot/1.0' : null);
    expect(extractUserAgent({ get: lookup })).toBe('TestBot/1.0');
  });
  test('should handle missing user agent', () => {
    expect(extractUserAgent({})).toBe('');
  });
 }); 
--- a/src/core.ts
+++ b/src/core.ts
@@ -0,0 +1,130 @@
 import { CrawlerConfig, CrawlerDetectionResult, HeadersLike } from './types.js';
 import { AI_CRAWLER_PATTERNS, SEO_CRAWLER_PATTERNS, matchesPatterns, detectCrawlerType } from './crawlers.js';
 /**
 * Default configuration for crawler blocking
 */
 export const DEFAULT_CONFIG: Required<CrawlerConfig> = {
  blockAI: true,
  blockSEO: false,
  message: 'Access denied',
  statusCode: 403,
  customBlocked: [],
  whitelist: [],
  headers: {},
  debug: false,
 };
 /**
 * Merge user config with default config.
 *
 * Each option falls back to its default when it is missing, `undefined` or
 * `null`, so an explicit `{ blockAI: undefined }` (easy to produce from plain
 * JavaScript or from computed options) can no longer switch a default off.
 * Array and header options are copied, never shared with DEFAULT_CONFIG.
 *
 * @param userConfig - partial overrides; may be omitted
 * @returns a fully populated configuration object
 */
 export function mergeConfig(userConfig: CrawlerConfig = {}): Required<CrawlerConfig> {
  // Tolerate an explicit null from untyped callers
  const overrides: CrawlerConfig = userConfig ?? {};
  return {
    blockAI: overrides.blockAI ?? DEFAULT_CONFIG.blockAI,
    blockSEO: overrides.blockSEO ?? DEFAULT_CONFIG.blockSEO,
    message: overrides.message ?? DEFAULT_CONFIG.message,
    statusCode: overrides.statusCode ?? DEFAULT_CONFIG.statusCode,
    customBlocked: [...DEFAULT_CONFIG.customBlocked, ...(overrides.customBlocked ?? [])],
    whitelist: [...DEFAULT_CONFIG.whitelist, ...(overrides.whitelist ?? [])],
    headers: { ...DEFAULT_CONFIG.headers, ...(overrides.headers ?? {}) },
    debug: overrides.debug ?? DEFAULT_CONFIG.debug,
  };
 }
 /**
 * Print a prefixed diagnostic line, but only when the merged configuration
 * has its debug flag switched on.
 */
 function debug(config: Required<CrawlerConfig>, message: string): void {
  if (!config.debug) {
    return;
  }
  // eslint-disable-next-line no-console
  console.log(`[crawl-me-not] ${message}`);
 }
 /**
 * Check if a user agent should be blocked based on the configuration.
 *
 * Checks run in this order and the first hit decides:
 * 1. whitelist - never blocked, reported with crawlerType null
 * 2. customBlocked - blocked as 'custom'
 * 3. AI patterns, only when blockAI is on - blocked as 'ai'
 * 4. SEO patterns, only when blockSEO is on - blocked as 'seo'
 *
 * Steps 3 and 4 are evaluated independently. Several agents appear in both
 * pattern lists and are classified as 'ai' by detectCrawlerType; they must
 * still be blocked when only blockSEO is enabled.
 *
 * @param userAgent - raw User-Agent header value
 * @param config - optional overrides merged over DEFAULT_CONFIG
 * @returns the decision; when nothing blocks, crawlerType still carries the
 *          detected type ('ai', 'seo' or null)
 */
 export function shouldBlockCrawler(userAgent: string, config: CrawlerConfig = {}): CrawlerDetectionResult {
  const mergedConfig = mergeConfig(config);
  // Default result
  const result: CrawlerDetectionResult = {
    isBlocked: false,
    crawlerType: null,
    userAgent,
  };
  debug(mergedConfig, `Checking user agent: ${userAgent}`);
  // Check whitelist first (takes precedence)
  const whitelistMatch = matchesPatterns(userAgent, mergedConfig.whitelist);
  if (whitelistMatch.match) {
    debug(mergedConfig, `User agent whitelisted by pattern: ${String(whitelistMatch.pattern)}`);
    return result;
  }
  // Check custom blocked patterns
  const customMatch = matchesPatterns(userAgent, mergedConfig.customBlocked);
  if (customMatch.match && customMatch.pattern) {
    debug(mergedConfig, `User agent blocked by custom pattern: ${String(customMatch.pattern)}`);
    return {
      ...result,
      isBlocked: true,
      crawlerType: 'custom',
      matchedPattern: customMatch.pattern,
    };
  }
  // Detect crawler type; null means neither built-in list matched
  const crawlerType = detectCrawlerType(userAgent);
  if (crawlerType !== null) {
    if (mergedConfig.blockAI) {
      const aiMatch = matchesPatterns(userAgent, AI_CRAWLER_PATTERNS);
      if (aiMatch.pattern) {
        debug(mergedConfig, `AI crawler blocked: ${String(aiMatch.pattern)}`);
        return {
          ...result,
          isBlocked: true,
          crawlerType: 'ai',
          matchedPattern: aiMatch.pattern,
        };
      }
    }
    // Not gated on crawlerType === 'seo': an agent listed in both pattern
    // sets is reported as 'ai' above but is still an SEO crawler.
    if (mergedConfig.blockSEO) {
      const seoMatch = matchesPatterns(userAgent, SEO_CRAWLER_PATTERNS);
      if (seoMatch.pattern) {
        debug(mergedConfig, `SEO crawler blocked: ${String(seoMatch.pattern)}`);
        return {
          ...result,
          isBlocked: true,
          crawlerType: 'seo',
          matchedPattern: seoMatch.pattern,
        };
      }
    }
  }
  debug(mergedConfig, 'User agent allowed');
  return {
    ...result,
    crawlerType,
  };
 }
 /**
 * Extract the User-Agent value from various header containers.
 *
 * Accepts either an object with a `get(name)` method (Web API `Headers`
 * style) or a plain header record. For plain records the lower-case and
 * canonical spellings are tried first, then any other casing of the key.
 *
 * @param headers - header container; a missing (null/undefined) container
 *                  passed by untyped callers is treated as "no headers"
 * @returns the user agent string, the first entry when the header is an
 *          array, or '' when the header is absent
 */
 export function extractUserAgent(headers: HeadersLike): string {
  // Untyped callers may hand over nothing at all; report "no user agent"
  if (headers === null || headers === undefined) {
    return '';
  }
  // Handle Headers-like object (Web API, SvelteKit, etc.)
  if (typeof (headers as { get(name: string): string | null }).get === 'function') {
    const headersObj = headers as { get(name: string): string | null };
    return headersObj.get('user-agent') || headersObj.get('User-Agent') || '';
  }
  // Handle regular object (Express, Node.js, etc.)
  const headersObj = headers as Record<string, string | string[] | undefined>;
  let userAgent = headersObj['user-agent'] || headersObj['User-Agent'];
  if (!userAgent) {
    // Header names are case-insensitive; fall back to a scan for other casings
    for (const [name, value] of Object.entries(headersObj)) {
      if (value && name.toLowerCase() === 'user-agent') {
        userAgent = value;
        break;
      }
    }
  }
  if (Array.isArray(userAgent)) {
    return userAgent[0] || '';
  }
  return userAgent || '';
 } 
--- a/src/crawlers.ts
+++ b/src/crawlers.ts
@@ -0,0 +1,160 @@
 /**
 * Known AI crawler user agent patterns
 * These are patterns for bots that are primarily used for AI training, data collection, or content scraping
 *
 * Patterns are tried in array order by matchesPatterns and the first hit is
 * the one reported. detectCrawlerType consults this list before the SEO
 * list, so any agent matching both is classified as 'ai'.
 */
 export const AI_CRAWLER_PATTERNS: (string | RegExp)[] = [
  // OpenAI
  /GPTBot/i,
  /ChatGPT-User/i,
  // Google AI
  /Google-Extended/i,
  /GoogleOther/i,
  // Anthropic
  /Claude-Web/i,
  /ClaudeBot/i,
  // Meta/Facebook AI
  /FacebookBot/i,
  /Meta-ExternalAgent/i,
  // Bytedance/TikTok
  /Bytespider/i,
  /ByteDance/i,
  // Common AI/ML crawlers
  /CCBot/i,
  /anthropic-ai/i,
  /PerplexityBot/i,
  /YouBot/i,
  /ChatGPT/i,
  // Broad: matches any user agent containing "gpt", which already covers
  // the GPTBot and ChatGPT patterns above
  /GPT/i,
  /OpenAI/i,
  /AI2Bot/i,
  /cohere-ai/i,
  // Academic/Research crawlers often used for AI training
  /ArchiveBot/i,
  /Internet Archive/i,
  /archive\.org/i,
  // Content scrapers and data collectors
  // (SemrushBot, AhrefsBot, MJ12bot and DotBot are also in SEO_CRAWLER_PATTERNS)
  /DataForSeoBot/i,
  /SemrushBot/i,
  /AhrefsBot/i,
  /MJ12bot/i,
  /DotBot/i,
  /CommonCrawl/i,
  /webzio/i,
  /Scrapy/i,
  // Same match set as /Scrapy/i above, since both are case-insensitive
  /scrapy/i,
  /python-requests/i,
  /python-urllib/i,
  /curl/i,
  /wget/i,
  /HTTPie/i,
  /Postman/i,
  /Insomnia/i,
  // Generic AI/bot patterns
  // (/crawler/i and /spider/i also hit SEO entries such as BacklinkCrawler and Baiduspider)
  /bot.*ai/i,
  /ai.*bot/i,
  /crawler/i,
  /scraper/i,
  /spider/i,
 ];
 /**
 * Known SEO crawler user agent patterns
 * These are legitimate crawlers used for SEO analysis and website monitoring
 *
 * detectCrawlerType only reaches this list when no AI pattern matched, so
 * entries that also match AI_CRAWLER_PATTERNS are never reported as 'seo'
 * by that function.
 */
 export const SEO_CRAWLER_PATTERNS: (string | RegExp)[] = [
  // Google SEO tools
  /Googlebot/i,
  /Google-Site-Verification/i,
  /Google-InspectionTool/i,
  // Bing
  /Bingbot/i,
  /BingPreview/i,
  /msnbot/i,
  // Yandex
  /YandexBot/i,
  /YandexImages/i,
  /YandexMetrika/i,
  // Baidu
  // (Baiduspider also matches the AI list's /spider/i; /Baidu/i covers both)
  /Baiduspider/i,
  /Baidu/i,
  // DuckDuckGo
  /DuckDuckBot/i,
  /DuckDuckGo/i,
  // SEO tools
  // (AhrefsBot, SemrushBot, MJ12bot and DotBot are also in AI_CRAWLER_PATTERNS;
  // BacklinkCrawler matches the AI list's /crawler/i)
  /AhrefsBot/i,
  /SemrushBot/i,
  /MJ12bot/i,
  /DotBot/i,
  /MegaIndex/i,
  /BacklinkCrawler/i,
  /SEOkicks/i,
  /sistrix/i,
  /BLEXBot/i,
  // Social media crawlers
  /facebookexternalhit/i,
  /Twitterbot/i,
  /LinkedInBot/i,
  /WhatsApp/i,
  /TelegramBot/i,
  /SkypeUriPreview/i,
  /Slackbot/i,
  /Discordbot/i,
  // Other search engines
  /Yahoo/i,
  /Slurp/i,
  /Ask Jeeves/i,
  /Teoma/i,
  /ia_archiver/i,
  /Wayback/i,
 ];
 /**
 * Check if a user agent matches any pattern in the given list.
 *
 * String patterns match as case-insensitive substrings; RegExp patterns are
 * tested against the user agent as given. Patterns are tried in order and
 * the first one that matches is returned.
 *
 * @param userAgent - user agent string to test
 * @param patterns - strings and/or regular expressions, in priority order
 * @returns `{ match: true, pattern }` for the first hit, otherwise `{ match: false }`
 */
 export function matchesPatterns(userAgent: string, patterns: (string | RegExp)[]): { match: boolean; pattern?: string | RegExp } {
  const lowerUserAgent = userAgent.toLowerCase();
  for (const pattern of patterns) {
    if (pattern instanceof RegExp) {
      // test() on a global or sticky RegExp resumes from lastIndex, so a
      // reused pattern would otherwise match only on alternate calls.
      const stateful = pattern.global || pattern.sticky;
      if (stateful) {
        pattern.lastIndex = 0;
      }
      const matched = pattern.test(userAgent);
      if (stateful) {
        // Leave the caller's RegExp in its initial state
        pattern.lastIndex = 0;
      }
      if (matched) {
        return { match: true, pattern };
      }
    } else if (lowerUserAgent.includes(pattern.toLowerCase())) {
      return { match: true, pattern };
    }
  }
  return { match: false };
 }
 /**
 * Classify a user agent as an AI crawler, an SEO crawler, or neither.
 * The AI list is consulted first, so it wins when both lists match.
 */
 export function detectCrawlerType(userAgent: string): 'ai' | 'seo' | null {
  if (matchesPatterns(userAgent, AI_CRAWLER_PATTERNS).match) {
    return 'ai';
  }
  return matchesPatterns(userAgent, SEO_CRAWLER_PATTERNS).match ? 'seo' : null;
 } 
--- a/src/index.ts
+++ b/src/index.ts
@@ -0,0 +1,24 @@
 // Core functionality
 export {
  shouldBlockCrawler,
  extractUserAgent,
  mergeConfig,
  DEFAULT_CONFIG,
 } from './core.js';
 // Crawler patterns and detection
 export {
  AI_CRAWLER_PATTERNS,
  SEO_CRAWLER_PATTERNS,
  matchesPatterns,
  detectCrawlerType,
 } from './crawlers.js';
 // Types
 // HeadersLike is the parameter type of extractUserAgent, so consumers need it
 // to type their own wrappers around that function.
 export type {
  CrawlerConfig,
  CrawlerDetectionResult,
  HeadersLike,
 } from './types.js';
 // Default export for convenience
 export { shouldBlockCrawler as default } from './core.js'; 
--- a/src/types.ts
+++ b/src/types.ts
@@ -0,0 +1,30 @@
 /**
 * Options accepted by shouldBlockCrawler and mergeConfig. Every field is
 * optional; omitted fields take the values from DEFAULT_CONFIG.
 */
 export interface CrawlerConfig {
  /** Block AI crawlers (default: true) */
  blockAI?: boolean;
  /** Block SEO crawlers (default: false) */
  blockSEO?: boolean;
  /**
   * Custom response message when blocking crawlers (default: "Access denied").
   * NOTE(review): core.ts merges this value but does not read it when deciding; presumably the caller applies it - confirm.
   */
  message?: string;
  /**
   * HTTP status code to return when blocking (default: 403).
   * NOTE(review): merged but not read by shouldBlockCrawler; presumably applied by the caller - confirm.
   */
  statusCode?: number;
  /** Additional user agents to block: strings match as case-insensitive substrings, RegExp values are tested as given */
  customBlocked?: (string | RegExp)[];
  /** User agents to always allow (takes precedence over blocking); same matching rules as customBlocked */
  whitelist?: (string | RegExp)[];
  /**
   * Custom response headers to set when blocking.
   * NOTE(review): merged but not read by shouldBlockCrawler; presumably applied by the caller - confirm.
   */
  headers?: Record<string, string>;
  /** Enable debug logging (default: false) */
  debug?: boolean;
 }
 /** Outcome of a shouldBlockCrawler call. */
 export interface CrawlerDetectionResult {
  /** True when the request should be rejected under the given configuration */
  isBlocked: boolean;
  /** Detected category; null when nothing matched or the agent was whitelisted */
  crawlerType: 'ai' | 'seo' | 'custom' | null;
  /** The user agent string that was checked, unchanged */
  userAgent: string;
  /** Pattern responsible for the block; only set when isBlocked is true */
  matchedPattern?: string | RegExp;
 }
 // Header types for different environments
 // extractUserAgent accepts either shape: a plain record of header values,
 // or any object exposing a get(name) lookup method.
 export type HeadersLike = 
  | Record<string, string | string[] | undefined>  // Express-style
  | { get(name: string): string | null };          // Web API Headers-style 
--- a/tsconfig.json
+++ b/tsconfig.json
@@ -0,0 +1,31 @@
 {
  "compilerOptions": {
    "target": "ES2020",
    "module": "commonjs",
    "lib": ["ES2020", "dom"],
    "outDir": "./dist",
    "rootDir": "./src",
    "strict": true,
    "esModuleInterop": true,
    "skipLibCheck": true,
    "forceConsistentCasingInFileNames": true,
    "declaration": true,
    "declarationMap": true,
    "sourceMap": true,
    "removeComments": false,
    "noImplicitAny": true,
    "noImplicitReturns": true,
    "noImplicitThis": true,
    "noUnusedLocals": true,
    "noUnusedParameters": true,
    "exactOptionalPropertyTypes": true,
    "noImplicitOverride": true,
    "noPropertyAccessFromIndexSignature": true,
    "noUncheckedIndexedAccess": true,
    "moduleResolution": "node",
    "resolveJsonModule": true,
    "allowSyntheticDefaultImports": true
  },
  "include": ["src/**/*"],
  "exclude": ["node_modules", "dist", "**/*.test.ts", "**/*.spec.ts", "examples"]
 }