add fuzzy search and combine it with semantic search results, update readme, change license file extension

This commit is contained in:
Silas 2024-06-10 20:51:35 -04:00
parent 3242c3bc77
commit 3320683d10
Failed to generate hash of commit
13 changed files with 176 additions and 88 deletions

View File

View File

@ -35,6 +35,6 @@ It will traverse through every `*.md` under `src/posts/poetry` and generate the
## License ## License
This project is licensed under the [MIT License](src/branch/main/LICENSE.md). This means that you can do pretty much anything you want with this code as long as you include the original copyright and license notice in your project. This project is licensed under the [MIT License](src/branch/main/LICENSE). This means that you can do pretty much anything you want with this code as long as you include the original copyright and license notice in your project.
Content contained in the `src/posts` folder is under the [CC BY-NC-SA-4.0](src/branch/main/src/posts/LICENSE.md) license. You're free to modify/distribute the posts contained in the `src/posts` folder so long as it's not for commercial purposes, you give attribution, and point out any modifications you've made. Content contained in the `src/posts` folder is under the [CC BY-NC-SA-4.0](src/branch/main/src/posts/LICENSE) license. You're free to modify/distribute the posts contained in the `src/posts` folder so long as it's not for commercial purposes, you give attribution, and point out any modifications you've made.

9
package-lock.json generated
View File

@ -14,6 +14,7 @@
"@threlte/core": "^7.3.0", "@threlte/core": "^7.3.0",
"@threlte/extras": "^8.11.2", "@threlte/extras": "^8.11.2",
"@threlte/rapier": "^2.0.0", "@threlte/rapier": "^2.0.0",
"fuse.js": "^7.0.0",
"marked": "^12.0.2", "marked": "^12.0.2",
"mdsvex": "^0.11.0", "mdsvex": "^0.11.0",
"three": "^0.159.0" "three": "^0.159.0"
@ -3427,6 +3428,14 @@
"url": "https://github.com/sponsors/ljharb" "url": "https://github.com/sponsors/ljharb"
} }
}, },
"node_modules/fuse.js": {
"version": "7.0.0",
"resolved": "https://registry.npmjs.org/fuse.js/-/fuse.js-7.0.0.tgz",
"integrity": "sha512-14F4hBIxqKvD4Zz/XjDc3y94mNZN6pRv3U13Udo0lNLCWRBUsrMv2xwcF/y/Z5sV6+FQW+/ow68cHpm4sunt8Q==",
"engines": {
"node": ">=10"
}
},
"node_modules/gauge": { "node_modules/gauge": {
"version": "3.0.2", "version": "3.0.2",
"resolved": "https://registry.npmjs.org/gauge/-/gauge-3.0.2.tgz", "resolved": "https://registry.npmjs.org/gauge/-/gauge-3.0.2.tgz",

View File

@ -52,6 +52,7 @@
"@threlte/core": "^7.3.0", "@threlte/core": "^7.3.0",
"@threlte/extras": "^8.11.2", "@threlte/extras": "^8.11.2",
"@threlte/rapier": "^2.0.0", "@threlte/rapier": "^2.0.0",
"fuse.js": "^7.0.0",
"marked": "^12.0.2", "marked": "^12.0.2",
"mdsvex": "^0.11.0", "mdsvex": "^0.11.0",
"three": "^0.159.0" "three": "^0.159.0"

View File

@ -8,32 +8,42 @@ import { marked } from 'marked';
async function extractTextFromMarkdown(filePath) { async function extractTextFromMarkdown(filePath) {
const markdown = await fs.readFile(filePath, 'utf8'); const markdown = await fs.readFile(filePath, 'utf8');
return marked(markdown).replace(/<[^>]*>/g, ''); // Strip HTML tags generated by marked // remove yaml frontmatter metadata
const result = markdown.replace(/---[\s\S]*?---/gm, '');
// remove html tags
const text = marked(result).replace(/<[^>]*>/g, '');
return text;
} }
async function generateEmbeddingsForDirectory(directoryPath) { async function generateEmbeddingsForDirectory(directoryPath) {
// Get all markdown files in directory // Get all markdown files in directory
const files = glob.sync(`${directoryPath}/*.md`); const files = glob.sync(`${directoryPath}/**/*.md`, {
ignore: [`${directoryPath}/LICENSE.md`]
});
// Extract texts from markdown files // Extract texts from markdown files
// eslint-disable-next-line @typescript-eslint/no-unused-vars // eslint-disable-next-line @typescript-eslint/no-unused-vars
const poems = await Promise.all(files.map(async (file, _index) => ({ const posts = await Promise.all(files.map(async (file, _index) => ({
id: path.basename(file, '.md'), // Use filename as ID id: path.basename(file, '.md'), // Use filename as ID,
text: await extractTextFromMarkdown(file) text: await extractTextFromMarkdown(file),
section: path.basename(path.dirname(file)),
filename: path.basename(file)
}))); })));
// Load the Universal Sentence Encoder model // Load the Universal Sentence Encoder model
const model = await use.load(); const model = await use.load();
const embeddings = await Promise.all(poems.map(poem => model.embed([poem.text]))); const embeddings = await Promise.all(posts.map(post => model.embed([post.text])));
// Map embeddings back to poem objects // Map embeddings back to post objects
const poemEmbeddings = poems.map((poem, index) => ({ const poemEmbeddings = posts.map((post, index) => ({
id: poem.id, id: post.id,
vector: embeddings[index].arraySync()[0] // Extract the vector vector: embeddings[index].arraySync()[0], // Extract the vector
section: post.section,
filename: post.id
})); }));
// Save embeddings to JSON file // Save embeddings to JSON file
fs.writeJson('embeddings.json', poemEmbeddings); fs.writeJson('embeddings.json', poemEmbeddings);
} }
generateEmbeddingsForDirectory('src/posts/poetry'); // Update path accordingly generateEmbeddingsForDirectory('src/posts'); // Update path accordingly

View File

@ -2,18 +2,22 @@
import type { SearchResult } from '$lib/utils/search'; import type { SearchResult } from '$lib/utils/search';
import { searchResults } from '$lib/store'; import { searchResults } from '$lib/store';
let searchQuery = ''; let timer: NodeJS.Timeout | undefined;
async function handleSearch() { async function handleSearch({ target }: Event) {
// const section = window.location.pathname.split('/')[1]; const { value } = target as HTMLInputElement;
const response = await fetch(`/api/poetry/search?q=${encodeURIComponent(searchQuery)}`); clearTimeout(timer);
if (response.ok) { timer = setTimeout(async () => {
const data: SearchResult[] = await response.json(); // const section = window.location.pathname.split('/')[1];
searchResults.set(data); const response = await fetch(`/api/search?q=${encodeURIComponent(value)}`);
} else { if (response.ok) {
console.error('Failed to fetch search results'); const data: SearchResult[] = await response.json();
searchResults.set([]); searchResults.set(data);
} } else {
console.error('Failed to fetch search results');
searchResults.set([]);
}
}, 300);
} }
</script> </script>
@ -54,20 +58,13 @@
type="text" type="text"
placeholder="Search" placeholder="Search"
class="input w-24 md:w-auto" class="input w-24 md:w-auto"
bind:value={searchQuery} on:keyup={handleSearch}
on:input={handleSearch}
/> />
</div> </div>
</div> </div>
<div class="navbar-end hidden lg:flex"> <div class="navbar-end hidden lg:flex">
<div class="form-control"> <div class="form-control">
<input <input type="text" placeholder="Search" class="input md:w-auto" on:keyup={handleSearch} />
type="text"
placeholder="Search"
class="input md:w-auto"
bind:value={searchQuery}
on:input={handleSearch}
/>
</div> </div>
<ul class="menu menu-horizontal px-1"> <ul class="menu menu-horizontal px-1">
<li><a href="/thoughts" class="link-primary">Thoughts</a></li> <li><a href="/thoughts" class="link-primary">Thoughts</a></li>

View File

@ -31,8 +31,12 @@
{#each results as result} {#each results as result}
<li class="py-4"> <li class="py-4">
<h3 class="pb-1"> <h3 class="pb-1">
<a class="link" href={`/poetry/${result.poem.id}`}>{slugToTitle(result.poem.id)}</a> <a class="link" href={`/${result.post.section}/${result.post.filename}`} target="_blank"
<p class="text-sm">(Relevance: {(result.similarity * 100).toFixed(3)}%)</p> >{slugToTitle(result.post.id)}</a
>
<p class="text-sm">
(Relevance: {(result.similarity * 100).toFixed(2)}%, Section: {result.post.section})
</p>
</h3> </h3>
</li> </li>
{/each} {/each}

View File

@ -1,7 +1,6 @@
export interface Metadata { export interface Metadata {
title: string; title: string;
date: string; date: string;
content: string;
categories?: string[]; categories?: string[];
draft?: boolean; draft?: boolean;
} }
@ -9,18 +8,25 @@ export interface Metadata {
export interface Section { export interface Section {
poetry: 'poetry'; poetry: 'poetry';
thoughts: 'thoughts'; thoughts: 'thoughts';
projects: 'projects'; services: 'services';
all: 'all';
} }
type SectionKey = keyof Section; type SectionKey = keyof Section;
export interface Post { export interface Post {
meta: Metadata; meta: Metadata;
path: string; content: string;
section: string;
filename: string;
id: string;
} }
interface Data { interface Data {
metadata: Metadata; metadata: Metadata;
default: {
render: () => { html: string };
};
} }
function isData(obj: unknown): obj is Data { function isData(obj: unknown): obj is Data {
@ -39,20 +45,19 @@ function isData(obj: unknown): obj is Data {
export const fetchMarkdownPosts = async ( export const fetchMarkdownPosts = async (
section: SectionKey, section: SectionKey,
limit: number, limit?: number,
offset: number offset?: number
): Promise<{ posts: Post[]; total: number }> => { ): Promise<{ posts: Post[]; total: number }> => {
let posts: Record<string, () => Promise<unknown>>; let posts: Record<string, () => Promise<unknown>>;
switch (section) { switch (section) {
case 'all':
posts = import.meta.glob('/src/posts/**/*.md');
break;
case 'poetry': case 'poetry':
posts = import.meta.glob('/src/posts/poetry/*.md'); posts = import.meta.glob('/src/posts/poetry/*.md');
break; break;
case 'projects':
posts = import.meta.glob('/src/routes/(app)/projects/posts/*.md');
break;
case 'thoughts': case 'thoughts':
posts = import.meta.glob('/src/posts/thoughts/*.md'); posts = import.meta.glob('/src/posts/thoughts/*.md');
console.log(posts);
break; break;
default: default:
throw new Error('Could not find this section'); throw new Error('Could not find this section');
@ -70,10 +75,18 @@ export const fetchMarkdownPosts = async (
return undefined; return undefined;
} }
const { metadata } = data; const { metadata } = data;
const postPath = path.slice(11, -3); const { html } = data.default.render();
// remove html tags
const content = html.replace(/<[^>]*>/g, '');
const section = path.split('/')[3];
const filename = path.split('/').pop()?.slice(0, -3);
return { return {
meta: { ...metadata }, meta: { ...metadata },
path: postPath content,
section,
filename,
id: data.metadata.title
}; };
} else { } else {
console.error('Could not properly parse this post'); console.error('Could not properly parse this post');
@ -94,6 +107,10 @@ export const fetchMarkdownPosts = async (
(b, a) => new Date(a?.meta.date || '').getTime() - new Date(b?.meta.date || '').getTime() (b, a) => new Date(a?.meta.date || '').getTime() - new Date(b?.meta.date || '').getTime()
); );
if (limit === undefined || offset === undefined) {
return { posts: sortedPosts, total: allPosts.length };
}
const paginatedPosts = sortedPosts.slice(offset, offset + limit); const paginatedPosts = sortedPosts.slice(offset, offset + limit);
return { posts: paginatedPosts, total: allPosts.length }; return { posts: paginatedPosts, total: allPosts.length };

File diff suppressed because one or more lines are too long

View File

@ -8,10 +8,12 @@ import * as tf from '@tensorflow/tfjs-node';
export type Embedding = { export type Embedding = {
id: string; id: string;
vector: number[]; vector: number[];
section: string;
filename: string;
}; };
export type SearchResult = { export type SearchResult = {
poem: Embedding; post: Embedding;
similarity: number; similarity: number;
}; };

View File

@ -1,40 +0,0 @@
// eslint-disable-next-line
import * as tf from '@tensorflow/tfjs-node';
import poemEmbeddings from '$lib/utils/poetry/embeddings.json';
import { json } from '@sveltejs/kit';
import { getModel, type Embedding, type SearchResult } from '$lib/utils/search';
// Search handler: semantic search over precomputed poem embeddings.
// Embeds the `q` query parameter with the sentence-encoder model from
// getModel(), scores every entry in poemEmbeddings by cosine similarity
// against the query vector, and returns the 10 best matches as JSON.
export const GET = async ({ url }: { url: URL }) => {
  const model = await getModel();
  const searchQuery = url.searchParams.get('q');
  if (!searchQuery) {
    // NOTE(review): returning a plain { status, body } object is the pre-1.0
    // SvelteKit endpoint shape; current SvelteKit requires a Response object
    // (e.g. json(..., { status: 400 })) — confirm the targeted framework version.
    return { status: 400, body: { error: 'Query parameter "q" is required' } };
  }
  try {
    // Generate embedding for the query
    const queryEmbedding = await model.embed([searchQuery]);
    const queryVec = queryEmbedding.arraySync()[0];
    // Calculate similarities: score each stored poem vector against the
    // query vector, best first.
    const results = poemEmbeddings
      .map((poem: Embedding) => ({
        poem,
        similarity: cosineSimilarity(queryVec, poem.vector)
      }))
      .sort((a: SearchResult, b: SearchResult) => b.similarity - a.similarity)
      .slice(0, 10); // Top 10 results
    return json(results);
  } catch (error) {
    // NOTE(review): same pre-1.0 plain-object shape as the 400 branch above.
    return { status: 500, body: { error: (error as Error).message } };
  }
};
/**
 * Cosine similarity of two equal-length numeric vectors:
 * dot(A, B) / (|A| * |B|). Result is in [-1, 1] for non-zero vectors.
 */
function cosineSimilarity(vecA: number[], vecB: number[]) {
  let dot = 0;
  let sumSqA = 0;
  let sumSqB = 0;
  // Single pass accumulates the dot product and both squared magnitudes.
  for (let i = 0; i < vecA.length; i++) {
    dot += vecA[i] * vecB[i];
    sumSqA += vecA[i] * vecA[i];
    sumSqB += vecB[i] * vecB[i];
  }
  return dot / (Math.sqrt(sumSqA) * Math.sqrt(sumSqB));
}

View File

@ -0,0 +1,88 @@
// eslint-disable-next-line
import * as tf from '@tensorflow/tfjs-node';
import postEmbeddings from '$lib/utils/poetry/embeddings.json';
import { json } from '@sveltejs/kit';
import { getModel, type Embedding, type SearchResult } from '$lib/utils/search';
import { fetchMarkdownPosts } from '$lib/utils';
import Fuse from 'fuse.js';
/**
 * GET /api/search?q=<query>
 *
 * Hybrid search endpoint: runs a Fuse.js fuzzy search over post content,
 * titles, and tags, a semantic search over precomputed sentence-encoder
 * embeddings, and merges the two into one ranked list.
 *
 * Responds with a JSON array of up to 10 SearchResult objects sorted by
 * descending combined similarity. 400 when `q` is missing, 500 on failure.
 */
export const GET = async ({ url }: { url: URL }) => {
  const searchQuery = url.searchParams.get('q');
  if (!searchQuery) {
    // FIX: a SvelteKit endpoint must return a Response; the previous plain
    // `{ status, body }` object is the pre-1.0 API and is rejected by current
    // SvelteKit. `json(data, init)` builds a proper Response with the status.
    return json({ error: 'Query parameter "q" is required' }, { status: 400 });
  }
  try {
    const model = await getModel();
    const { posts } = await fetchMarkdownPosts('all');
    const fuse = new Fuse(posts, {
      keys: ['content', 'meta.title', 'meta.tags'],
      includeScore: true
    });
    // Fuzzy search. Fuse scores: 0 = perfect match, 1 = no match.
    const fuzzyResults = fuse.search(searchQuery, { limit: 10 });
    // Generate embedding for the query
    const queryEmbedding = await model.embed([searchQuery]);
    const queryVec = queryEmbedding.arraySync()[0];
    // Semantic similarity against every precomputed post embedding.
    let semanticResults = postEmbeddings.map((post: Embedding) => ({
      post,
      similarity: cosineSimilarity(queryVec, post.vector)
    }));
    // Fold fuzzy hits into their matching semantic results; matched fuzzy
    // hits are spliced out so only unmatched ones remain for the next step.
    semanticResults = semanticResults.map((semanticResult) => {
      const fuzzyResultIndex = fuzzyResults.findIndex(
        (fuzzyResult) =>
          fuzzyResult.item.section === semanticResult.post.section &&
          fuzzyResult.item.filename === semanticResult.post.filename
      );
      if (fuzzyResultIndex > -1) {
        const fuzzyResult = fuzzyResults.splice(fuzzyResultIndex, 1)[0];
        // FIX: compare against undefined rather than truthiness — a perfect
        // fuzzy match has score 0, which the old `score && ...` check skipped.
        // Boost only strong matches (score < 0.2, i.e. 1 - score > 0.8).
        // NOTE(review): `1 - fuzzyResult.score / 2` parses as
        // `1 - (score / 2)`; confirm `(1 - score) / 2` was not intended.
        if (fuzzyResult.score !== undefined && 1 - fuzzyResult.score > 0.8) {
          semanticResult.similarity = 1 - fuzzyResult.score / 2 + semanticResult.similarity;
        }
      }
      return semanticResult;
    });
    // Append fuzzy-only hits (posts with no embedding entry), using a
    // placeholder vector; weak matches (score >= 0.1) keep similarity 0.
    semanticResults.push(
      ...fuzzyResults.map((fuzzyResult) => {
        let similarity = 0;
        // FIX: same undefined check — score 0 is a valid (perfect) match.
        if (fuzzyResult.score !== undefined && 1 - fuzzyResult.score > 0.9) {
          similarity = 1 - fuzzyResult.score / 2;
        }
        return {
          post: {
            id: fuzzyResult.item.id,
            section: fuzzyResult.item.section,
            filename: fuzzyResult.item.filename,
            vector: [0, 0, 0]
          },
          similarity: similarity
        };
      })
    );
    // Top 10 combined results, best first.
    semanticResults = semanticResults
      .sort((a: SearchResult, b: SearchResult) => b.similarity - a.similarity)
      .slice(0, 10);
    return json(semanticResults);
  } catch (error) {
    // FIX: same Response requirement as the 400 branch — use json() + status.
    return json({ error: (error as Error).message }, { status: 500 });
  }
};
/**
 * Cosine similarity of two equal-length numeric vectors:
 * dot(A, B) / (|A| * |B|).
 *
 * Returns 0 when either vector has zero magnitude — the unguarded division
 * previously produced NaN (0/0), which would poison the similarity sort.
 */
function cosineSimilarity(vecA: number[], vecB: number[]) {
  const dotProduct = vecA.reduce((acc, val, i) => acc + val * vecB[i], 0);
  const magnitudeA = Math.sqrt(vecA.reduce((acc, val) => acc + val * val, 0));
  const magnitudeB = Math.sqrt(vecB.reduce((acc, val) => acc + val * val, 0));
  const denominator = magnitudeA * magnitudeB;
  // Guard: a zero vector has no direction, so report no similarity.
  return denominator === 0 ? 0 : dotProduct / denominator;
}