init commit

commit d13ea65209

@@ -0,0 +1,23 @@
{
  "env": {
    "browser": true,
    "es2021": true,
    "node": true,
    "jest": true
  },
  "extends": [
    "eslint:recommended"
  ],
  "parser": "@typescript-eslint/parser",
  "parserOptions": {
    "ecmaVersion": "latest",
    "sourceType": "module"
  },
  "plugins": ["@typescript-eslint"],
  "rules": {
    "@typescript-eslint/no-unused-vars": "error",
    "prefer-const": "error",
    "no-var": "error"
  },
  "ignorePatterns": ["dist/", "node_modules/", "*.js", "examples/", "**/*.test.ts", "**/*.spec.ts"]
}

@@ -0,0 +1,59 @@
# Dependencies
node_modules/
npm-debug.log*
yarn-debug.log*
yarn-error.log*

# Build output
dist/
build/
*.tsbuildinfo

# Coverage directory used by tools like istanbul
coverage/
*.lcov

# Runtime data
pids
*.pid
*.seed
*.pid.lock

# Logs
logs
*.log

# Environment variables
.env
.env.local
.env.development.local
.env.test.local
.env.production.local

# Editor directories and files
.vscode/
.idea/
*.swp
*.swo
*~

# OS generated files
.DS_Store
.DS_Store?
._*
.Spotlight-V100
.Trashes
ehthumbs.db
Thumbs.db

# Temporary folders
tmp/
temp/

# Testing
.nyc_output

# Package manager lock files (uncomment if you want to include them)
# package-lock.json
# yarn.lock
# pnpm-lock.yaml

@@ -0,0 +1 @@
nodejs 22.16.0

@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2024 crawl-me-not contributors

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

@@ -0,0 +1,355 @@
# crawl-me-not 🚫🤖

A lightweight, framework-agnostic library to detect and block AI crawlers and SEO crawlers from any web server or framework.

## Features

- 🚫 **Block AI Crawlers**: Detect 43+ AI training bots like GPTBot, ChatGPT-User, Claude-Web, and more
- 🔍 **Optional SEO Blocking**: Also detect SEO crawlers when needed
- 🎯 **Framework Agnostic**: Works with Express, SvelteKit, Next.js, Fastify, vanilla Node.js, and more
- 🛠️ **Highly Configurable**: Custom patterns, whitelists, response messages, and headers
- 📝 **TypeScript**: Full TypeScript support with detailed type definitions
- 🪶 **Zero Dependencies**: Lightweight with no external dependencies
- 🧪 **Well Tested**: Comprehensive test coverage

## Installation

```bash
npm install crawl-me-not
```

## Quick Start

```typescript
import { shouldBlockCrawler, extractUserAgent } from 'crawl-me-not';

// Basic usage
const userAgent = extractUserAgent(request.headers);
const result = shouldBlockCrawler(userAgent);

if (result.isBlocked) {
  // Send 403 response
  return new Response('Access denied', { status: 403 });
}

// Continue with normal request handling
```

## Configuration Options

```typescript
interface CrawlerConfig {
  blockAI?: boolean;                    // Block AI crawlers (default: true)
  blockSEO?: boolean;                   // Block SEO crawlers (default: false)
  message?: string;                     // Custom response message (default: "Access denied")
  statusCode?: number;                  // HTTP status code (default: 403)
  customBlocked?: (string | RegExp)[];  // Additional patterns to block
  whitelist?: (string | RegExp)[];      // Patterns to always allow
  headers?: Record<string, string>;     // Custom response headers
  debug?: boolean;                      // Enable debug logging (default: false)
}
```

## Framework Examples

### Express

```typescript
import express from 'express';
import { shouldBlockCrawler, extractUserAgent } from 'crawl-me-not';

const app = express();

app.use((req, res, next) => {
  const userAgent = extractUserAgent(req.headers);
  const result = shouldBlockCrawler(userAgent, {
    blockAI: true,
    blockSEO: false,
    debug: true
  });

  if (result.isBlocked) {
    return res.status(403).json({
      error: 'Access denied',
      reason: `${result.crawlerType} crawler detected`,
      userAgent: result.userAgent
    });
  }

  next();
});
```

### SvelteKit

```typescript
// src/hooks.server.ts
import { shouldBlockCrawler, extractUserAgent } from 'crawl-me-not';
import type { Handle } from '@sveltejs/kit';

export const handle: Handle = async ({ event, resolve }) => {
  const userAgent = extractUserAgent(event.request.headers);
  const result = shouldBlockCrawler(userAgent);

  if (result.isBlocked) {
    return new Response('Access denied', {
      status: 403,
      headers: { 'X-Blocked-Reason': 'AI crawler detected' }
    });
  }

  return resolve(event);
};
```

### Next.js (App Router)

```typescript
// middleware.ts
import { shouldBlockCrawler, extractUserAgent } from 'crawl-me-not';
import { NextRequest, NextResponse } from 'next/server';

export function middleware(request: NextRequest) {
  const userAgent = extractUserAgent(request.headers);
  const result = shouldBlockCrawler(userAgent);

  if (result.isBlocked) {
    return NextResponse.json(
      { error: 'Access denied' },
      { status: 403 }
    );
  }

  return NextResponse.next();
}
```

### Next.js (Pages Router)

```typescript
// pages/api/[...all].ts
import { shouldBlockCrawler, extractUserAgent } from 'crawl-me-not';
import type { NextApiRequest, NextApiResponse } from 'next';

export default function handler(req: NextApiRequest, res: NextApiResponse) {
  const userAgent = extractUserAgent(req.headers);
  const result = shouldBlockCrawler(userAgent);

  if (result.isBlocked) {
    return res.status(403).json({ error: 'Access denied' });
  }

  // Continue with your API logic
  res.status(200).json({ message: 'Hello World' });
}
```

### Fastify

```typescript
import Fastify from 'fastify';
import { shouldBlockCrawler, extractUserAgent } from 'crawl-me-not';

const fastify = Fastify();

fastify.addHook('preHandler', async (request, reply) => {
  const userAgent = extractUserAgent(request.headers);
  const result = shouldBlockCrawler(userAgent);

  if (result.isBlocked) {
    reply.status(403).send({ error: 'Access denied' });
    return;
  }
});
```

### Vanilla Node.js

```typescript
import http from 'node:http';
import { shouldBlockCrawler, extractUserAgent } from 'crawl-me-not';

const server = http.createServer((req, res) => {
  const userAgent = extractUserAgent(req.headers);
  const result = shouldBlockCrawler(userAgent);

  if (result.isBlocked) {
    res.statusCode = 403;
    res.setHeader('Content-Type', 'application/json');
    res.end(JSON.stringify({ error: 'Access denied' }));
    return;
  }

  // Your normal request handling
  res.statusCode = 200;
  res.end('Hello World!');
});
```

### Bun

```typescript
import { shouldBlockCrawler, extractUserAgent } from 'crawl-me-not';

Bun.serve({
  fetch(request) {
    const userAgent = extractUserAgent(request.headers);
    const result = shouldBlockCrawler(userAgent);

    if (result.isBlocked) {
      return new Response('Access denied', { status: 403 });
    }

    return new Response('Hello World!');
  },
});
```

## Advanced Usage

### Custom Configuration

```typescript
const result = shouldBlockCrawler(userAgent, {
  blockAI: true,
  blockSEO: false,
  customBlocked: [
    /badbot/i,           // Block anything with "badbot"
    'unwanted-crawler',  // Block exact string match
    /scraper.*v[0-9]/i   // Block scraper versions
  ],
  whitelist: [
    /goodbot/i,          // Always allow "goodbot"
    'monitoring-service' // Always allow this service
  ],
  message: 'Custom blocking message',
  statusCode: 429,
  headers: {
    'X-Blocked-Reason': 'Automated traffic detected',
    'Retry-After': '3600'
  },
  debug: true
});
```

### Manual Detection (Non-blocking)

```typescript
import { shouldBlockCrawler, extractUserAgent } from 'crawl-me-not';

// Just detect, don't block
const userAgent = extractUserAgent(request.headers);
const result = shouldBlockCrawler(userAgent, { blockAI: false, blockSEO: false });

// Log crawler activity
if (result.crawlerType) {
  console.log(`Detected ${result.crawlerType} crawler:`, result.userAgent);
}

// Apply custom logic
if (result.crawlerType === 'ai' && isRateLimited(request)) {
  return blockResponse();
}
```

### Rate Limiting for Crawlers

```typescript
const crawlerLimits = new Map();

app.use((req, res, next) => {
  const userAgent = extractUserAgent(req.headers);
  const result = shouldBlockCrawler(userAgent, { blockAI: false });

  if (result.crawlerType === 'ai') {
    const ip = req.ip;
    const now = Date.now();
    const limit = crawlerLimits.get(ip) || { count: 0, resetTime: now + 60000 };

    if (now > limit.resetTime) {
      limit.count = 0;
      limit.resetTime = now + 60000;
    }

    limit.count++;
    crawlerLimits.set(ip, limit);

    if (limit.count > 10) {
      return res.status(429).json({ error: 'Rate limit exceeded' });
    }
  }

  next();
});
```

## Known Crawlers

### AI Crawlers (Detected by default)

- **OpenAI**: GPTBot, ChatGPT-User
- **Google AI**: Google-Extended, GoogleOther
- **Anthropic**: Claude-Web, ClaudeBot
- **Meta/Facebook**: FacebookBot, Meta-ExternalAgent
- **ByteDance**: Bytespider, ByteDance
- **Others**: CCBot, PerplexityBot, YouBot, AI2Bot, cohere-ai
- **Generic patterns**: python-requests, curl, wget, scrapy, etc.
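
The generic patterns also match common HTTP tools such as curl, wget, and python-requests, so automation you run yourself can be caught. A minimal sketch of whitelisting a trusted probe (the `monitoring-service` user-agent string is a made-up example):

```typescript
import { shouldBlockCrawler } from 'crawl-me-not';

// curl/ is matched by the generic AI patterns, so whitelist probes you trust.
const result = shouldBlockCrawler('curl/8.5.0 monitoring-service', {
  whitelist: ['monitoring-service'], // checked before any blocking patterns
});

console.log(result.isBlocked); // false (the whitelist takes precedence)
```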

### SEO Crawlers (Detected but allowed by default)

- **Search Engines**: Googlebot, Bingbot, YandexBot, Baiduspider, DuckDuckBot
- **SEO Tools**: AhrefsBot, SemrushBot, MJ12bot, DotBot
- **Social Media**: facebookexternalhit, Twitterbot, LinkedInBot, WhatsApp
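
These are detected but allowed unless you opt in with `blockSEO: true`. A minimal sketch:

```typescript
import { shouldBlockCrawler } from 'crawl-me-not';

// Googlebot is classified as an SEO crawler; blockSEO opts in to blocking it.
const result = shouldBlockCrawler('Googlebot/2.1', { blockSEO: true });

console.log(result.isBlocked);   // true
console.log(result.crawlerType); // 'seo'
```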

## API Reference

### `shouldBlockCrawler(userAgent: string, config?: CrawlerConfig): CrawlerDetectionResult`

Main function to check if a user agent should be blocked.

**Returns:**
```typescript
interface CrawlerDetectionResult {
  isBlocked: boolean;                           // Whether the crawler should be blocked
  crawlerType: 'ai' | 'seo' | 'custom' | null;  // Type of crawler detected
  userAgent: string;                            // The original user agent string
  matchedPattern?: string | RegExp;             // Pattern that matched (if blocked)
}
```
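
A blocked result also carries the pattern that matched, which is handy for logging. A minimal sketch:

```typescript
import { shouldBlockCrawler } from 'crawl-me-not';

const result = shouldBlockCrawler('GPTBot/1.0');

if (result.isBlocked) {
  // matchedPattern is only present when a blocking pattern actually matched
  console.log(`Blocked ${result.crawlerType} crawler via`, result.matchedPattern);
}
```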

### `extractUserAgent(headers: HeadersLike): string`

Utility function to extract user agent from various header formats.

**Supports:**
- Express-style headers: `{ 'user-agent': 'string' }`
- Web API Headers: `headers.get('user-agent')`
- Node.js IncomingMessage: `req.headers['user-agent']`
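
All three shapes normalize to a plain string. A minimal sketch (the `Headers` constructor assumes a runtime that provides the Web API, such as Node 18+ or a browser):

```typescript
import { extractUserAgent } from 'crawl-me-not';

// Express / Node.js style: a plain object keyed by lowercase header names
extractUserAgent({ 'user-agent': 'TestBot/1.0' });              // 'TestBot/1.0'

// Web API Headers (fetch, SvelteKit, Next.js middleware)
extractUserAgent(new Headers({ 'User-Agent': 'TestBot/1.0' })); // 'TestBot/1.0'

// Missing header falls back to an empty string
extractUserAgent({});                                           // ''
```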

### `detectCrawlerType(userAgent: string): 'ai' | 'seo' | null`

Detect what type of crawler the user agent represents, without applying any blocking logic.
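
A minimal sketch of using it purely for classification:

```typescript
import { detectCrawlerType } from 'crawl-me-not';

detectCrawlerType('GPTBot/1.0');    // 'ai'
detectCrawlerType('Googlebot/2.1'); // 'seo'
detectCrawlerType('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'); // null
```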

### Constants

- `AI_CRAWLER_PATTERNS`: Array of patterns for AI crawlers
- `SEO_CRAWLER_PATTERNS`: Array of patterns for SEO crawlers
- `DEFAULT_CONFIG`: Default configuration object
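
These are plain exports, so you can inspect or reuse them directly. A minimal sketch:

```typescript
import {
  AI_CRAWLER_PATTERNS,
  SEO_CRAWLER_PATTERNS,
  DEFAULT_CONFIG,
} from 'crawl-me-not';

console.log(AI_CRAWLER_PATTERNS.length);  // number of built-in AI patterns
console.log(SEO_CRAWLER_PATTERNS.length); // number of built-in SEO patterns
console.log(DEFAULT_CONFIG.statusCode);   // 403
console.log(DEFAULT_CONFIG.blockSEO);     // false
```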

## Contributing

Contributions are welcome! Please feel free to submit issues and pull requests.

## License

MIT © crawl-me-not contributors

## Changelog

### 1.0.0
- Initial release
- Framework-agnostic design
- Comprehensive AI crawler detection
- Optional SEO crawler detection
- TypeScript support
- Zero dependencies

@@ -0,0 +1,39 @@
const express = require('express');
const { shouldBlockCrawler, extractUserAgent } = require('crawl-me-not');

const app = express();

// Middleware to block AI crawlers
app.use((req, res, next) => {
  const userAgent = extractUserAgent(req.headers);
  const result = shouldBlockCrawler(userAgent, {
    blockAI: true,
    blockSEO: false,
    debug: true
  });

  if (result.isBlocked) {
    console.log(`Blocked ${result.crawlerType} crawler: ${result.userAgent}`);
    return res.status(403).json({
      error: 'Access denied',
      reason: `${result.crawlerType} crawler detected`,
      userAgent: result.userAgent
    });
  }

  next();
});

app.get('/', (req, res) => {
  res.json({ message: 'Hello World! AI crawlers are blocked.' });
});

app.get('/api/data', (req, res) => {
  res.json({ data: 'This API is protected from AI crawlers' });
});

const port = process.env.PORT || 3000;
app.listen(port, () => {
  console.log(`Server running on port ${port}`);
  console.log('AI crawlers will receive a 403 response');
});

@@ -0,0 +1,46 @@
import { shouldBlockCrawler, extractUserAgent } from 'crawl-me-not';
import { NextRequest, NextResponse } from 'next/server';

export function middleware(request: NextRequest) {
  const userAgent = extractUserAgent(request.headers);
  const result = shouldBlockCrawler(userAgent, {
    blockAI: true,
    blockSEO: false,
    customBlocked: [/scrapy/i, /curl/i], // Block additional patterns
    debug: true
  });

  if (result.isBlocked) {
    console.log(`Blocked ${result.crawlerType} crawler: ${result.userAgent}`);

    return NextResponse.json(
      {
        error: 'Access denied',
        reason: `${result.crawlerType} crawler detected`,
        userAgent: result.userAgent
      },
      {
        status: 403,
        headers: {
          'X-Blocked-Reason': 'Automated traffic detected'
        }
      }
    );
  }

  return NextResponse.next();
}

// Configure which paths to run middleware on
export const config = {
  matcher: [
    /*
     * Match all request paths except for the ones starting with:
     * - api (API routes)
     * - _next/static (static files)
     * - _next/image (image optimization files)
     * - favicon.ico (favicon file)
     */
    '/((?!api|_next/static|_next/image|favicon.ico).*)',
  ],
};

@@ -0,0 +1,32 @@
import { shouldBlockCrawler, extractUserAgent } from 'crawl-me-not';
import type { Handle } from '@sveltejs/kit';

export const handle: Handle = async ({ event, resolve }) => {
  const userAgent = extractUserAgent(event.request.headers);
  const result = shouldBlockCrawler(userAgent, {
    blockAI: true,
    blockSEO: false,
    debug: true
  });

  if (result.isBlocked) {
    console.log(`Blocked ${result.crawlerType} crawler: ${result.userAgent}`);
    // Send a JSON body to match the Content-Type header below
    return new Response(JSON.stringify({ error: 'Access denied' }), {
      status: 403,
      headers: {
        'Content-Type': 'application/json',
        'X-Blocked-Reason': `${result.crawlerType} crawler detected`
      }
    });
  }

  return resolve(event);
};

// If you need to compose multiple handles:
// import { sequence } from '@sveltejs/kit/hooks';
//
// export const handle = sequence(
//   crawlerBlockingHandle,
//   // your other handles...
// );

@@ -0,0 +1,39 @@
const http = require('node:http');
const { shouldBlockCrawler, extractUserAgent } = require('crawl-me-not');

const server = http.createServer((req, res) => {
  const userAgent = extractUserAgent(req.headers);
  const result = shouldBlockCrawler(userAgent, {
    blockAI: true,
    blockSEO: false,
    debug: true
  });

  if (result.isBlocked) {
    console.log(`Blocked ${result.crawlerType} crawler: ${result.userAgent}`);

    res.statusCode = 403;
    res.setHeader('Content-Type', 'application/json');
    res.setHeader('X-Blocked-Reason', 'AI crawler detected');
    res.end(JSON.stringify({
      error: 'Access denied',
      reason: `${result.crawlerType} crawler detected`,
      userAgent: result.userAgent
    }));
    return;
  }

  // Normal request handling
  res.statusCode = 200;
  res.setHeader('Content-Type', 'application/json');
  res.end(JSON.stringify({
    message: 'Hello World!',
    timestamp: new Date().toISOString()
  }));
});

const port = process.env.PORT || 3000;
server.listen(port, () => {
  console.log(`Server running on port ${port}`);
  console.log('AI crawlers will receive a 403 response');
});

@@ -0,0 +1,21 @@
module.exports = {
  preset: 'ts-jest',
  testEnvironment: 'node',
  roots: ['<rootDir>/src'],
  testMatch: ['**/__tests__/**/*.ts', '**/?(*.)+(spec|test).ts'],
  transform: {
    '^.+\\.ts$': ['ts-jest', {
      useESM: true,
    }],
  },
  collectCoverageFrom: [
    'src/**/*.ts',
    '!src/**/*.d.ts',
  ],
  coverageDirectory: 'coverage',
  coverageReporters: ['text', 'lcov', 'html'],
  moduleNameMapper: {
    '^(\\.{1,2}/.*)\\.js$': '$1',
  },
  extensionsToTreatAsEsm: ['.ts'],
};

File diff suppressed because it is too large

@@ -0,0 +1,57 @@
{
  "name": "@silentsilas/crawl-me-not",
  "version": "1.0.0",
  "description": "Detect and block AI crawlers and SEO crawlers from any web server or framework",
  "main": "dist/index.js",
  "types": "dist/index.d.ts",
  "files": [
    "dist",
    "README.md",
    "LICENSE"
  ],
  "scripts": {
    "build": "tsc",
    "dev": "tsc --watch",
    "lint": "eslint src/**/*.ts",
    "lint:fix": "eslint src/**/*.ts --fix",
    "test": "jest",
    "prepublishOnly": "npm run build",
    "clean": "rm -rf dist"
  },
  "keywords": [
    "crawler",
    "bot",
    "ai",
    "seo",
    "middleware",
    "robots",
    "scraping",
    "protection",
    "user-agent",
    "detection",
    "framework-agnostic"
  ],
  "author": "Silas",
  "license": "MIT",
  "repository": {
    "type": "git",
    "url": "git@git.silentsilas.com:silentsilas/crawl-me-not.git"
  },
  "bugs": {
    "url": "https://git.silentsilas.com/silentsilas/crawl-me-not/issues"
  },
  "homepage": "https://git.silentsilas.com/silentsilas/crawl-me-not#readme",
  "devDependencies": {
    "@types/jest": "^29.5.8",
    "@types/node": "^20.9.0",
    "@typescript-eslint/eslint-plugin": "^6.12.0",
    "@typescript-eslint/parser": "^6.12.0",
    "eslint": "^8.54.0",
    "jest": "^29.7.0",
    "ts-jest": "^29.1.1",
    "typescript": "^5.2.2"
  },
  "engines": {
    "node": ">=16.0.0"
  }
}

@@ -0,0 +1,58 @@
import { shouldBlockCrawler, extractUserAgent } from './core.js';

describe('shouldBlockCrawler', () => {
  test('should block AI crawlers by default', () => {
    const result = shouldBlockCrawler('GPTBot/1.0');
    expect(result.isBlocked).toBe(true);
    expect(result.crawlerType).toBe('ai');
  });

  test('should not block SEO crawlers by default', () => {
    const result = shouldBlockCrawler('Googlebot/2.1');
    expect(result.isBlocked).toBe(false);
    expect(result.crawlerType).toBe('seo');
  });

  test('should respect whitelist', () => {
    const result = shouldBlockCrawler('GPTBot/1.0', {
      whitelist: [/GPTBot/i]
    });
    expect(result.isBlocked).toBe(false);
  });

  test('should block custom patterns', () => {
    const result = shouldBlockCrawler('CustomBot/1.0', {
      customBlocked: [/CustomBot/i]
    });
    expect(result.isBlocked).toBe(true);
    expect(result.crawlerType).toBe('custom');
  });

  test('should allow regular browsers', () => {
    const result = shouldBlockCrawler('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36');
    expect(result.isBlocked).toBe(false);
    expect(result.crawlerType).toBe(null);
  });
});

describe('extractUserAgent', () => {
  test('should extract from Express-style headers', () => {
    const headers = { 'user-agent': 'TestBot/1.0' };
    const userAgent = extractUserAgent(headers);
    expect(userAgent).toBe('TestBot/1.0');
  });

  test('should extract from SvelteKit-style headers', () => {
    const headers = {
      get: (name: string) => name === 'user-agent' ? 'TestBot/1.0' : null
    };
    const userAgent = extractUserAgent(headers);
    expect(userAgent).toBe('TestBot/1.0');
  });

  test('should handle missing user agent', () => {
    const headers = {};
    const userAgent = extractUserAgent(headers);
    expect(userAgent).toBe('');
  });
});

@@ -0,0 +1,130 @@
import { CrawlerConfig, CrawlerDetectionResult, HeadersLike } from './types.js';
import { AI_CRAWLER_PATTERNS, SEO_CRAWLER_PATTERNS, matchesPatterns, detectCrawlerType } from './crawlers.js';

/**
 * Default configuration for crawler blocking
 */
export const DEFAULT_CONFIG: Required<CrawlerConfig> = {
  blockAI: true,
  blockSEO: false,
  message: 'Access denied',
  statusCode: 403,
  customBlocked: [],
  whitelist: [],
  headers: {},
  debug: false,
};

/**
 * Merge user config with default config
 */
export function mergeConfig(userConfig: CrawlerConfig = {}): Required<CrawlerConfig> {
  return {
    ...DEFAULT_CONFIG,
    ...userConfig,
    customBlocked: [...DEFAULT_CONFIG.customBlocked, ...(userConfig.customBlocked || [])],
    whitelist: [...DEFAULT_CONFIG.whitelist, ...(userConfig.whitelist || [])],
    headers: { ...DEFAULT_CONFIG.headers, ...(userConfig.headers || {}) },
  };
}

/**
 * Log debug information if debug mode is enabled
 */
function debug(config: Required<CrawlerConfig>, message: string): void {
  if (config.debug) {
    // eslint-disable-next-line no-console
    console.log(`[crawl-me-not] ${message}`);
  }
}

/**
 * Check if a user agent should be blocked based on the configuration
 */
export function shouldBlockCrawler(userAgent: string, config: CrawlerConfig = {}): CrawlerDetectionResult {
  const mergedConfig = mergeConfig(config);

  // Default result
  const result: CrawlerDetectionResult = {
    isBlocked: false,
    crawlerType: null,
    userAgent,
  };

  debug(mergedConfig, `Checking user agent: ${userAgent}`);

  // Check whitelist first (takes precedence)
  const whitelistMatch = matchesPatterns(userAgent, mergedConfig.whitelist);
  if (whitelistMatch.match) {
    debug(mergedConfig, `User agent whitelisted by pattern: ${String(whitelistMatch.pattern)}`);
    return result;
  }

  // Check custom blocked patterns
  const customMatch = matchesPatterns(userAgent, mergedConfig.customBlocked);
  if (customMatch.match && customMatch.pattern) {
    debug(mergedConfig, `User agent blocked by custom pattern: ${String(customMatch.pattern)}`);
    return {
      ...result,
      isBlocked: true,
      crawlerType: 'custom',
      matchedPattern: customMatch.pattern,
    };
  }

  // Detect crawler type
  const crawlerType = detectCrawlerType(userAgent);

  if (crawlerType === 'ai' && mergedConfig.blockAI) {
    const aiMatch = matchesPatterns(userAgent, AI_CRAWLER_PATTERNS);
    if (aiMatch.pattern) {
      debug(mergedConfig, `AI crawler blocked: ${String(aiMatch.pattern)}`);
      return {
        ...result,
        isBlocked: true,
        crawlerType: 'ai',
        matchedPattern: aiMatch.pattern,
      };
    }
  }

  if (crawlerType === 'seo' && mergedConfig.blockSEO) {
    const seoMatch = matchesPatterns(userAgent, SEO_CRAWLER_PATTERNS);
    if (seoMatch.pattern) {
      debug(mergedConfig, `SEO crawler blocked: ${String(seoMatch.pattern)}`);
      return {
        ...result,
        isBlocked: true,
        crawlerType: 'seo',
        matchedPattern: seoMatch.pattern,
      };
    }
  }

  debug(mergedConfig, 'User agent allowed');
  return {
    ...result,
    crawlerType,
  };
}

/**
 * Extract user agent from various header formats
 */
export function extractUserAgent(headers: HeadersLike): string {
  // Handle Headers-like object (Web API, SvelteKit, etc.)
  if (typeof (headers as { get(name: string): string | null }).get === 'function') {
    const headersObj = headers as { get(name: string): string | null };
    return headersObj.get('user-agent') || headersObj.get('User-Agent') || '';
  }

  // Handle regular object (Express, Node.js, etc.)
  const headersObj = headers as Record<string, string | string[] | undefined>;
  const userAgent = headersObj['user-agent'] || headersObj['User-Agent'];

  if (Array.isArray(userAgent)) {
    return userAgent[0] || '';
  }

  return userAgent || '';
}

@@ -0,0 +1,160 @@
/**
 * Known AI crawler user agent patterns
 * These are patterns for bots that are primarily used for AI training, data collection, or content scraping
 */
export const AI_CRAWLER_PATTERNS: (string | RegExp)[] = [
  // OpenAI
  /GPTBot/i,
  /ChatGPT-User/i,

  // Google AI
  /Google-Extended/i,
  /GoogleOther/i,

  // Anthropic
  /Claude-Web/i,
  /ClaudeBot/i,

  // Meta/Facebook AI
  /FacebookBot/i,
  /Meta-ExternalAgent/i,

  // Bytedance/TikTok
  /Bytespider/i,
  /ByteDance/i,

  // Common AI/ML crawlers
  /CCBot/i,
  /anthropic-ai/i,
  /PerplexityBot/i,
  /YouBot/i,
  /ChatGPT/i,
  /GPT/i,
  /OpenAI/i,
  /AI2Bot/i,
  /cohere-ai/i,

  // Academic/Research crawlers often used for AI training
  /ArchiveBot/i,
  /Internet Archive/i,
  /archive\.org/i,

  // Content scrapers and data collectors
  /DataForSeoBot/i,
  /SemrushBot/i,
  /AhrefsBot/i,
  /MJ12bot/i,
  /DotBot/i,
  /CommonCrawl/i,
  /webzio/i,
  /Scrapy/i,
  /scrapy/i,
  /python-requests/i,
  /python-urllib/i,
  /curl/i,
  /wget/i,
  /HTTPie/i,
  /Postman/i,
  /Insomnia/i,

  // Generic AI/bot patterns
  /bot.*ai/i,
  /ai.*bot/i,
  /crawler/i,
  /scraper/i,
  /spider/i,
];

/**
 * Known SEO crawler user agent patterns
 * These are legitimate crawlers used for SEO analysis and website monitoring
 */
export const SEO_CRAWLER_PATTERNS: (string | RegExp)[] = [
  // Google SEO tools
  /Googlebot/i,
  /Google-Site-Verification/i,
  /Google-InspectionTool/i,

  // Bing
  /Bingbot/i,
  /BingPreview/i,
  /msnbot/i,

  // Yandex
  /YandexBot/i,
  /YandexImages/i,
  /YandexMetrika/i,

  // Baidu
  /Baiduspider/i,
  /Baidu/i,

  // DuckDuckGo
  /DuckDuckBot/i,
  /DuckDuckGo/i,

  // SEO tools
  /AhrefsBot/i,
  /SemrushBot/i,
  /MJ12bot/i,
  /DotBot/i,
  /MegaIndex/i,
  /BacklinkCrawler/i,
  /SEOkicks/i,
  /sistrix/i,
  /BLEXBot/i,

  // Social media crawlers
  /facebookexternalhit/i,
  /Twitterbot/i,
  /LinkedInBot/i,
  /WhatsApp/i,
  /TelegramBot/i,
  /SkypeUriPreview/i,
  /Slackbot/i,
  /Discordbot/i,

  // Other search engines
  /Yahoo/i,
  /Slurp/i,
  /Ask Jeeves/i,
  /Teoma/i,
  /ia_archiver/i,
  /Wayback/i,
];

/**
 * Check if a user agent matches any pattern in the given list
 */
export function matchesPatterns(userAgent: string, patterns: (string | RegExp)[]): { match: boolean; pattern?: string | RegExp } {
  const lowerUserAgent = userAgent.toLowerCase();

  for (const pattern of patterns) {
    if (pattern instanceof RegExp) {
      if (pattern.test(userAgent)) {
        return { match: true, pattern };
      }
    } else if (lowerUserAgent.includes(pattern.toLowerCase())) {
      return { match: true, pattern };
    }
  }

  return { match: false };
}

/**
 * Detect the type of crawler based on user agent
 */
export function detectCrawlerType(userAgent: string): 'ai' | 'seo' | null {
  const aiMatch = matchesPatterns(userAgent, AI_CRAWLER_PATTERNS);
  if (aiMatch.match) {
    return 'ai';
  }

  const seoMatch = matchesPatterns(userAgent, SEO_CRAWLER_PATTERNS);
  if (seoMatch.match) {
    return 'seo';
  }

  return null;
}

@@ -0,0 +1,24 @@
// Core functionality
export {
  shouldBlockCrawler,
  extractUserAgent,
  mergeConfig,
  DEFAULT_CONFIG,
} from './core.js';

// Crawler patterns and detection
export {
  AI_CRAWLER_PATTERNS,
  SEO_CRAWLER_PATTERNS,
  matchesPatterns,
  detectCrawlerType,
} from './crawlers.js';

// Types
export type {
  CrawlerConfig,
  CrawlerDetectionResult,
} from './types.js';

// Default export for convenience
export { shouldBlockCrawler as default } from './core.js';

@@ -0,0 +1,30 @@
export interface CrawlerConfig {
  /** Block AI crawlers (default: true) */
  blockAI?: boolean;
  /** Block SEO crawlers (default: false) */
  blockSEO?: boolean;
  /** Custom response message when blocking crawlers (default: "Access denied") */
  message?: string;
  /** HTTP status code to return when blocking (default: 403) */
  statusCode?: number;
  /** Additional user agents to block (regex patterns) */
  customBlocked?: (string | RegExp)[];
  /** User agents to always allow (takes precedence over blocking) */
  whitelist?: (string | RegExp)[];
  /** Custom response headers to set when blocking */
  headers?: Record<string, string>;
  /** Enable debug logging (default: false) */
  debug?: boolean;
}

export interface CrawlerDetectionResult {
  isBlocked: boolean;
  crawlerType: 'ai' | 'seo' | 'custom' | null;
  userAgent: string;
  matchedPattern?: string | RegExp;
}

// Header types for different environments
export type HeadersLike =
  | Record<string, string | string[] | undefined> // Express-style
  | { get(name: string): string | null }; // Web API Headers-style

@@ -0,0 +1,31 @@
{
  "compilerOptions": {
    "target": "ES2020",
    "module": "commonjs",
    "lib": ["ES2020", "dom"],
    "outDir": "./dist",
    "rootDir": "./src",
    "strict": true,
    "esModuleInterop": true,
    "skipLibCheck": true,
    "forceConsistentCasingInFileNames": true,
    "declaration": true,
    "declarationMap": true,
    "sourceMap": true,
    "removeComments": false,
    "noImplicitAny": true,
    "noImplicitReturns": true,
    "noImplicitThis": true,
    "noUnusedLocals": true,
    "noUnusedParameters": true,
    "exactOptionalPropertyTypes": true,
    "noImplicitOverride": true,
    "noPropertyAccessFromIndexSignature": true,
    "noUncheckedIndexedAccess": true,
    "moduleResolution": "node",
    "resolveJsonModule": true,
    "allowSyntheticDefaultImports": true
  },
  "include": ["src/**/*"],
  "exclude": ["node_modules", "dist", "**/*.test.ts", "**/*.spec.ts", "examples"]
}