add fuzzy search and combine it with semantic search results, update readme, change license file extension

Silas 2024-06-10 20:51:35 -04:00
parent 3242c3bc77
commit 3320683d10
13 changed files with 176 additions and 88 deletions
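For orientation before the file-by-file diff: the new search endpoint combines a Fuse.js fuzzy score over the raw post content with a cosine similarity between Universal Sentence Encoder embeddings. The sketch below is an illustrative restatement of the blending rule from the `/api/search` handler further down; the `blendScores` helper is hypothetical and not part of the commit.

```ts
// Illustrative helper, not part of this commit: it restates the blending rule in
// the new /api/search endpoint. Fuse.js scores run from 0 (perfect match) to
// 1 (no match), so `1 - score` acts as a relevance value.
function blendScores(semanticSimilarity: number, fuseScore?: number): number {
	// Post only matched semantically: keep the cosine similarity as-is.
	if (fuseScore === undefined) return semanticSimilarity;
	// Only strong fuzzy matches (relevance above 0.8) boost the semantic score.
	if (1 - fuseScore > 0.8) {
		return semanticSimilarity + (1 - fuseScore / 2);
	}
	return semanticSimilarity;
}

// Example: a near-exact match (fuseScore ≈ 0.1) on a post with cosine similarity
// 0.4 blends to roughly 0.4 + 0.95 = 1.35, pushing it to the top of the results.
// Posts found only by Fuse.js use a stricter 0.9 threshold and score 1 - score / 2.
```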


@@ -35,6 +35,6 @@ It will traverse through every `*.md` under `src/posts/poetry` and generate the
## License
This project is licensed under the [MIT License](src/branch/main/LICENSE.md). This means that you can do pretty much anything you want with this code as long as you include the original copyright and license notice in your project.
This project is licensed under the [MIT License](src/branch/main/LICENSE). This means that you can do pretty much anything you want with this code as long as you include the original copyright and license notice in your project.
Content contained in the `src/posts` folder is under the [CC BY-NC-SA-4.0](src/branch/main/src/posts/LICENSE.md) license. You're free to modify/distribute the posts contained in the `src/posts` folder so long as it's not for commercial purposes, you give attribution, and point out any modifications you've made.
Content contained in the `src/posts` folder is under the [CC BY-NC-SA-4.0](src/branch/main/src/posts/LICENSE) license. You're free to modify/distribute the posts contained in the `src/posts` folder so long as it's not for commercial purposes, you give attribution, and point out any modifications you've made.

package-lock.json generated

@@ -14,6 +14,7 @@
"@threlte/core": "^7.3.0",
"@threlte/extras": "^8.11.2",
"@threlte/rapier": "^2.0.0",
"fuse.js": "^7.0.0",
"marked": "^12.0.2",
"mdsvex": "^0.11.0",
"three": "^0.159.0"
@@ -3427,6 +3428,14 @@
"url": "https://github.com/sponsors/ljharb"
}
},
"node_modules/fuse.js": {
"version": "7.0.0",
"resolved": "https://registry.npmjs.org/fuse.js/-/fuse.js-7.0.0.tgz",
"integrity": "sha512-14F4hBIxqKvD4Zz/XjDc3y94mNZN6pRv3U13Udo0lNLCWRBUsrMv2xwcF/y/Z5sV6+FQW+/ow68cHpm4sunt8Q==",
"engines": {
"node": ">=10"
}
},
"node_modules/gauge": {
"version": "3.0.2",
"resolved": "https://registry.npmjs.org/gauge/-/gauge-3.0.2.tgz",


@@ -52,6 +52,7 @@
"@threlte/core": "^7.3.0",
"@threlte/extras": "^8.11.2",
"@threlte/rapier": "^2.0.0",
"fuse.js": "^7.0.0",
"marked": "^12.0.2",
"mdsvex": "^0.11.0",
"three": "^0.159.0"


@@ -8,32 +8,42 @@ import { marked } from 'marked';
async function extractTextFromMarkdown(filePath) {
const markdown = await fs.readFile(filePath, 'utf8');
return marked(markdown).replace(/<[^>]*>/g, ''); // Strip HTML tags generated by marked
// remove yaml frontmatter metadata
const result = markdown.replace(/---[\s\S]*?---/gm, '');
// remove html tags
const text = marked(result).replace(/<[^>]*>/g, '');
return text;
}
async function generateEmbeddingsForDirectory(directoryPath) {
// Get all markdown files in directory
const files = glob.sync(`${directoryPath}/*.md`);
const files = glob.sync(`${directoryPath}/**/*.md`, {
ignore: [`${directoryPath}/LICENSE.md`]
});
// Extract texts from markdown files
// eslint-disable-next-line @typescript-eslint/no-unused-vars
const poems = await Promise.all(files.map(async (file, _index) => ({
id: path.basename(file, '.md'), // Use filename as ID
text: await extractTextFromMarkdown(file)
const posts = await Promise.all(files.map(async (file, _index) => ({
id: path.basename(file, '.md'), // Use filename as ID
text: await extractTextFromMarkdown(file),
section: path.basename(path.dirname(file)),
filename: path.basename(file)
})));
// Load the Universal Sentence Encoder model
const model = await use.load();
const embeddings = await Promise.all(poems.map(poem => model.embed([poem.text])));
const embeddings = await Promise.all(posts.map(post => model.embed([post.text])));
// Map embeddings back to poem objects
const poemEmbeddings = poems.map((poem, index) => ({
id: poem.id,
vector: embeddings[index].arraySync()[0] // Extract the vector
// Map embeddings back to post objects
const poemEmbeddings = posts.map((post, index) => ({
id: post.id,
vector: embeddings[index].arraySync()[0], // Extract the vector
section: post.section,
filename: post.id
}));
// Save embeddings to JSON file
fs.writeJson('embeddings.json', poemEmbeddings);
}
generateEmbeddingsForDirectory('src/posts/poetry'); // Update path accordingly
generateEmbeddingsForDirectory('src/posts'); // Update path accordingly
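With the wider glob, the script now embeds every post under `src/posts` (skipping `LICENSE.md`) and writes one record per file. A hedged sketch of what a single `embeddings.json` entry should look like, inferred from the mapping above (the name `some-post` and the vector values are placeholders):

```ts
// Hedged sketch of one record in the regenerated embeddings.json; field names
// match the Embedding type added later in this commit. The real vector is a
// Universal Sentence Encoder embedding (512 floats in practice).
const exampleEntry = {
	id: 'some-post',         // basename without the .md extension (placeholder name)
	vector: [0.012, -0.044], // truncated here; the real array has 512 values
	section: 'poetry',       // parent folder under src/posts
	filename: 'some-post'    // currently set to the id, so the extension is already stripped
};
```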


@@ -2,18 +2,22 @@
import type { SearchResult } from '$lib/utils/search';
import { searchResults } from '$lib/store';
let searchQuery = '';
let timer: NodeJS.Timeout | undefined;
async function handleSearch() {
// const section = window.location.pathname.split('/')[1];
const response = await fetch(`/api/poetry/search?q=${encodeURIComponent(searchQuery)}`);
if (response.ok) {
const data: SearchResult[] = await response.json();
searchResults.set(data);
} else {
console.error('Failed to fetch search results');
searchResults.set([]);
}
async function handleSearch({ target }: Event) {
const { value } = target as HTMLInputElement;
clearTimeout(timer);
timer = setTimeout(async () => {
// const section = window.location.pathname.split('/')[1];
const response = await fetch(`/api/search?q=${encodeURIComponent(value)}`);
if (response.ok) {
const data: SearchResult[] = await response.json();
searchResults.set(data);
} else {
console.error('Failed to fetch search results');
searchResults.set([]);
}
}, 300);
}
</script>
@@ -54,20 +58,13 @@
type="text"
placeholder="Search"
class="input w-24 md:w-auto"
bind:value={searchQuery}
on:input={handleSearch}
on:keyup={handleSearch}
/>
</div>
</div>
<div class="navbar-end hidden lg:flex">
<div class="form-control">
<input
type="text"
placeholder="Search"
class="input md:w-auto"
bind:value={searchQuery}
on:input={handleSearch}
/>
<input type="text" placeholder="Search" class="input md:w-auto" on:keyup={handleSearch} />
</div>
<ul class="menu menu-horizontal px-1">
<li><a href="/thoughts" class="link-primary">Thoughts</a></li>
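Both search inputs now share a single `keyup` handler that debounces the request: every keystroke clears the pending timer, and the fetch only fires after 300 ms of quiet. The standalone `debounce` helper below is a hypothetical restatement of that timer pattern, not code from this commit:

```ts
// Hypothetical standalone debounce helper illustrating the timer pattern used
// in handleSearch above: only the last call within the delay window actually runs.
function debounce<T extends unknown[]>(fn: (...args: T) => void, delay = 300) {
	let timer: ReturnType<typeof setTimeout> | undefined;
	return (...args: T) => {
		clearTimeout(timer);
		timer = setTimeout(() => fn(...args), delay);
	};
}

// Usage sketch: const debouncedSearch = debounce(runSearch);
// input.addEventListener('keyup', debouncedSearch);
```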


@@ -31,8 +31,12 @@
{#each results as result}
<li class="py-4">
<h3 class="pb-1">
<a class="link" href={`/poetry/${result.poem.id}`}>{slugToTitle(result.poem.id)}</a>
<p class="text-sm">(Relevance: {(result.similarity * 100).toFixed(3)}%)</p>
<a class="link" href={`/${result.post.section}/${result.post.filename}`} target="_blank"
>{slugToTitle(result.post.id)}</a
>
<p class="text-sm">
(Relevance: {(result.similarity * 100).toFixed(2)}%, Section: {result.post.section})
</p>
</h3>
</li>
{/each}


@@ -1,7 +1,6 @@
export interface Metadata {
title: string;
date: string;
content: string;
categories?: string[];
draft?: boolean;
}
@@ -9,18 +8,25 @@ export interface Metadata {
export interface Section {
poetry: 'poetry';
thoughts: 'thoughts';
projects: 'projects';
services: 'services';
all: 'all';
}
type SectionKey = keyof Section;
export interface Post {
meta: Metadata;
path: string;
content: string;
section: string;
filename: string;
id: string;
}
interface Data {
metadata: Metadata;
default: {
render: () => { html: string };
};
}
function isData(obj: unknown): obj is Data {
@@ -39,20 +45,19 @@ function isData(obj: unknown): obj is Data {
export const fetchMarkdownPosts = async (
section: SectionKey,
limit: number,
offset: number
limit?: number,
offset?: number
): Promise<{ posts: Post[]; total: number }> => {
let posts: Record<string, () => Promise<unknown>>;
switch (section) {
case 'all':
posts = import.meta.glob('/src/posts/**/*.md');
break;
case 'poetry':
posts = import.meta.glob('/src/posts/poetry/*.md');
break;
case 'projects':
posts = import.meta.glob('/src/routes/(app)/projects/posts/*.md');
break;
case 'thoughts':
posts = import.meta.glob('/src/posts/thoughts/*.md');
console.log(posts);
break;
default:
throw new Error('Could not find this section');
@@ -70,10 +75,18 @@ export const fetchMarkdownPosts = async (
return undefined;
}
const { metadata } = data;
const postPath = path.slice(11, -3);
const { html } = data.default.render();
// remove html tags
const content = html.replace(/<[^>]*>/g, '');
const section = path.split('/')[3];
const filename = path.split('/').pop()?.slice(0, -3);
return {
meta: { ...metadata },
path: postPath
content,
section,
filename,
id: data.metadata.title
};
} else {
console.error('Could not properly parse this post');
@@ -94,6 +107,10 @@ export const fetchMarkdownPosts = async (
(b, a) => new Date(a?.meta.date || '').getTime() - new Date(b?.meta.date || '').getTime()
);
if (limit === undefined || offset === undefined) {
return { posts: sortedPosts, total: allPosts.length };
}
const paginatedPosts = sortedPosts.slice(offset, offset + limit);
return { posts: paginatedPosts, total: allPosts.length };
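Because `limit` and `offset` are now optional, callers can request either a single page or the entire sorted list (the new search endpoint relies on the latter). A hedged usage sketch, with the import path taken from the new `/api/search` handler and the surrounding `loadExamples` function invented for illustration:

```ts
import { fetchMarkdownPosts } from '$lib/utils'; // import path as used by the new endpoint

// Hypothetical caller showing both modes of the relaxed signature.
async function loadExamples() {
	// Every post across all sections, newest first — what the /api/search handler requests.
	const { posts: allPosts, total } = await fetchMarkdownPosts('all');

	// First page of ten poetry posts.
	const { posts: firstPage } = await fetchMarkdownPosts('poetry', 10, 0);

	return { allPosts, total, firstPage };
}
```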

File diff suppressed because one or more lines are too long


@@ -8,10 +8,12 @@ import * as tf from '@tensorflow/tfjs-node';
export type Embedding = {
id: string;
vector: number[];
section: string;
filename: string;
};
export type SearchResult = {
poem: Embedding;
post: Embedding;
similarity: number;
};
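Taken together, the search endpoint responds with an array of these `SearchResult` objects, which the Navbar drops into the `searchResults` store. An invented example payload for illustration:

```ts
// Made-up payload showing what GET /api/search?q=... returns; real entries carry
// full 512-value vectors, while fuzzy-only hits get the [0, 0, 0] placeholder
// seen in the new endpoint below.
const exampleResponse: SearchResult[] = [
	{
		post: { id: 'some-post', vector: [0, 0, 0], section: 'poetry', filename: 'some-post' },
		similarity: 1.31
	}
];
```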


@@ -1,40 +0,0 @@
// eslint-disable-next-line
import * as tf from '@tensorflow/tfjs-node';
import poemEmbeddings from '$lib/utils/poetry/embeddings.json';
import { json } from '@sveltejs/kit';
import { getModel, type Embedding, type SearchResult } from '$lib/utils/search';
// Search handler
export const GET = async ({ url }: { url: URL }) => {
const model = await getModel();
const searchQuery = url.searchParams.get('q');
if (!searchQuery) {
return { status: 400, body: { error: 'Query parameter "q" is required' } };
}
try {
// Generate embedding for the query
const queryEmbedding = await model.embed([searchQuery]);
const queryVec = queryEmbedding.arraySync()[0];
// Calculate similarities
const results = poemEmbeddings
.map((poem: Embedding) => ({
poem,
similarity: cosineSimilarity(queryVec, poem.vector)
}))
.sort((a: SearchResult, b: SearchResult) => b.similarity - a.similarity)
.slice(0, 10); // Top 10 results
return json(results);
} catch (error) {
return { status: 500, body: { error: (error as Error).message } };
}
};
function cosineSimilarity(vecA: number[], vecB: number[]) {
const dotProduct = vecA.reduce((acc, val, i) => acc + val * vecB[i], 0);
const magnitudeA = Math.sqrt(vecA.reduce((acc, val) => acc + val * val, 0));
const magnitudeB = Math.sqrt(vecB.reduce((acc, val) => acc + val * val, 0));
return dotProduct / (magnitudeA * magnitudeB);
}


@@ -0,0 +1,88 @@
// eslint-disable-next-line
import * as tf from '@tensorflow/tfjs-node';
import postEmbeddings from '$lib/utils/poetry/embeddings.json';
import { json } from '@sveltejs/kit';
import { getModel, type Embedding, type SearchResult } from '$lib/utils/search';
import { fetchMarkdownPosts } from '$lib/utils';
import Fuse from 'fuse.js';
// Search handler
export const GET = async ({ url }: { url: URL }) => {
const searchQuery = url.searchParams.get('q');
if (!searchQuery) {
return { status: 400, body: { error: 'Query parameter "q" is required' } };
}
try {
const model = await getModel();
const { posts } = await fetchMarkdownPosts('all');
const fuse = new Fuse(posts, {
keys: ['content', 'meta.title', 'meta.tags'],
includeScore: true
});
// Fuzzy search
const fuzzyResults = fuse.search(searchQuery, { limit: 10 });
// Generate embedding for the query
const queryEmbedding = await model.embed([searchQuery]);
const queryVec = queryEmbedding.arraySync()[0];
// Calculate similarities
let semanticResults = postEmbeddings.map((post: Embedding) => ({
post,
similarity: cosineSimilarity(queryVec, post.vector)
}));
// add fuzzy results to semantic results
semanticResults = semanticResults.map((semanticResult) => {
const fuzzyResultIndex = fuzzyResults.findIndex(
(fuzzyResult) =>
fuzzyResult.item.section === semanticResult.post.section &&
fuzzyResult.item.filename === semanticResult.post.filename
);
if (fuzzyResultIndex > -1) {
const fuzzyResult = fuzzyResults.splice(fuzzyResultIndex, 1)[0];
if (fuzzyResult.score && 1 - fuzzyResult.score > 0.8) {
semanticResult.similarity = 1 - fuzzyResult.score / 2 + semanticResult.similarity;
}
}
return semanticResult;
});
// add rest of fuzzy results
semanticResults.push(
...fuzzyResults.map((fuzzyResult) => {
let similarity = 0;
if (fuzzyResult.score && 1 - fuzzyResult.score > 0.9) {
similarity = 1 - fuzzyResult.score / 2;
}
return {
post: {
id: fuzzyResult.item.id,
section: fuzzyResult.item.section,
filename: fuzzyResult.item.filename,
vector: [0, 0, 0]
},
similarity: similarity
};
})
);
semanticResults = semanticResults
.sort((a: SearchResult, b: SearchResult) => b.similarity - a.similarity)
.slice(0, 10);
return json(semanticResults);
} catch (error) {
return { status: 500, body: { error: (error as Error).message } };
}
};
function cosineSimilarity(vecA: number[], vecB: number[]) {
const dotProduct = vecA.reduce((acc, val, i) => acc + val * vecB[i], 0);
const magnitudeA = Math.sqrt(vecA.reduce((acc, val) => acc + val * val, 0));
const magnitudeB = Math.sqrt(vecB.reduce((acc, val) => acc + val * val, 0));
return dotProduct / (magnitudeA * magnitudeB);
}
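The handler ranks every post by the cosine similarity between the query embedding and each stored vector before blending in the fuzzy boosts. A quick, purely illustrative sanity check of the `cosineSimilarity` helper:

```ts
// Sanity check of cosineSimilarity above (illustrative only): parallel vectors
// score 1, orthogonal vectors score 0, opposite vectors score -1.
console.log(cosineSimilarity([1, 0], [2, 0]));  // 1
console.log(cosineSimilarity([1, 0], [0, 1]));  // 0
console.log(cosineSimilarity([1, 0], [-1, 0])); // -1
```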