add fuzzy search and combine it with semantic search results, update readme, change license file extension

This commit is contained in:
2024-06-10 20:51:35 -04:00
parent 3242c3bc77
commit 3320683d10
13 changed files with 176 additions and 88 deletions

View File

@@ -2,18 +2,22 @@
import type { SearchResult } from '$lib/utils/search';
import { searchResults } from '$lib/store';
let searchQuery = '';
let timer: NodeJS.Timeout | undefined;
async function handleSearch() {
// const section = window.location.pathname.split('/')[1];
const response = await fetch(`/api/poetry/search?q=${encodeURIComponent(searchQuery)}`);
if (response.ok) {
const data: SearchResult[] = await response.json();
searchResults.set(data);
} else {
console.error('Failed to fetch search results');
searchResults.set([]);
}
async function handleSearch({ target }: Event) {
const { value } = target as HTMLInputElement;
clearTimeout(timer);
timer = setTimeout(async () => {
// const section = window.location.pathname.split('/')[1];
const response = await fetch(`/api/search?q=${encodeURIComponent(value)}`);
if (response.ok) {
const data: SearchResult[] = await response.json();
searchResults.set(data);
} else {
console.error('Failed to fetch search results');
searchResults.set([]);
}
}, 300);
}
</script>
@@ -54,20 +58,13 @@
type="text"
placeholder="Search"
class="input w-24 md:w-auto"
bind:value={searchQuery}
on:input={handleSearch}
on:keyup={handleSearch}
/>
</div>
</div>
<div class="navbar-end hidden lg:flex">
<div class="form-control">
<input
type="text"
placeholder="Search"
class="input md:w-auto"
bind:value={searchQuery}
on:input={handleSearch}
/>
<input type="text" placeholder="Search" class="input md:w-auto" on:keyup={handleSearch} />
</div>
<ul class="menu menu-horizontal px-1">
<li><a href="/thoughts" class="link-primary">Thoughts</a></li>

View File

@@ -31,8 +31,12 @@
{#each results as result}
<li class="py-4">
<h3 class="pb-1">
<a class="link" href={`/poetry/${result.poem.id}`}>{slugToTitle(result.poem.id)}</a>
<p class="text-sm">(Relevance: {(result.similarity * 100).toFixed(3)}%)</p>
<a class="link" href={`/${result.post.section}/${result.post.filename}`} target="_blank"
>{slugToTitle(result.post.id)}</a
>
<p class="text-sm">
(Relevance: {(result.similarity * 100).toFixed(2)}%, Section: {result.post.section})
</p>
</h3>
</li>
{/each}

View File

@@ -1,7 +1,6 @@
export interface Metadata {
title: string;
date: string;
content: string;
categories?: string[];
draft?: boolean;
}
@@ -9,18 +8,25 @@ export interface Metadata {
/**
 * Names of the content sections known to the site.
 *
 * Every property maps a section name to the string literal of the same name,
 * so `keyof Section` yields the union of valid section names.
 */
export interface Section {
poetry: 'poetry';
thoughts: 'thoughts';
projects: 'projects';
services: 'services';
// Pseudo-section; NOTE(review): presumably selects posts from every section — confirm against the loader.
all: 'all';
}
type SectionKey = keyof Section;
export interface Post {
meta: Metadata;
path: string;
content: string;
section: string;
filename: string;
id: string;
}
/**
 * Shape a loaded post module must have before it is turned into a `Post`.
 */
interface Data {
// Metadata block of the post (title, date, ...).
metadata: Metadata;
// Default export of the module; `render()` yields the post body as an HTML string.
default: {
render: () => { html: string };
};
}
function isData(obj: unknown): obj is Data {
@@ -39,20 +45,19 @@ function isData(obj: unknown): obj is Data {
export const fetchMarkdownPosts = async (
section: SectionKey,
limit: number,
offset: number
limit?: number,
offset?: number
): Promise<{ posts: Post[]; total: number }> => {
let posts: Record<string, () => Promise<unknown>>;
switch (section) {
case 'all':
posts = import.meta.glob('/src/posts/**/*.md');
break;
case 'poetry':
posts = import.meta.glob('/src/posts/poetry/*.md');
break;
case 'projects':
posts = import.meta.glob('/src/routes/(app)/projects/posts/*.md');
break;
case 'thoughts':
posts = import.meta.glob('/src/posts/thoughts/*.md');
console.log(posts);
break;
default:
throw new Error('Could not find this section');
@@ -70,10 +75,18 @@ export const fetchMarkdownPosts = async (
return undefined;
}
const { metadata } = data;
const postPath = path.slice(11, -3);
const { html } = data.default.render();
// remove html tags
const content = html.replace(/<[^>]*>/g, '');
const section = path.split('/')[3];
const filename = path.split('/').pop()?.slice(0, -3);
return {
meta: { ...metadata },
path: postPath
content,
section,
filename,
id: data.metadata.title
};
} else {
console.error('Could not properly parse this post');
@@ -94,6 +107,10 @@ export const fetchMarkdownPosts = async (
(b, a) => new Date(a?.meta.date || '').getTime() - new Date(b?.meta.date || '').getTime()
);
if (limit === undefined || offset === undefined) {
return { posts: sortedPosts, total: allPosts.length };
}
const paginatedPosts = sortedPosts.slice(offset, offset + limit);
return { posts: paginatedPosts, total: allPosts.length };

File diff suppressed because one or more lines are too long

View File

@@ -8,10 +8,12 @@ import * as tf from '@tensorflow/tfjs-node';
/**
 * One searchable entry: a post identifier together with its embedding vector
 * and the location information needed to link back to the post.
 */
export type Embedding = {
// Identifier of the entry; NOTE(review): appears to be shown to the user as the result title — confirm.
id: string;
// Numeric embedding that query vectors are compared against.
vector: number[];
// Section the post belongs to.
section: string;
// File name of the post; NOTE(review): presumably without the `.md` extension — confirm.
filename: string;
};
export type SearchResult = {
poem: Embedding;
post: Embedding;
similarity: number;
};

View File

@@ -1,40 +0,0 @@
// eslint-disable-next-line
import * as tf from '@tensorflow/tfjs-node';
import poemEmbeddings from '$lib/utils/poetry/embeddings.json';
import { json } from '@sveltejs/kit';
import { getModel, type Embedding, type SearchResult } from '$lib/utils/search';
/**
 * Search handler for poems.
 *
 * Embeds the query found in the `q` URL parameter, compares it against every
 * stored poem embedding with cosine similarity, and responds with the ten most
 * similar poems as JSON.
 *
 * @param event - Request event; only `url` is read.
 * @returns A JSON `Response`: the ranked results on success, `{ error }` with
 *   status 400 when `q` is missing or empty, or `{ error }` with status 500
 *   when loading the model or computing the embedding throws.
 */
export const GET = async ({ url }: { url: URL }) => {
  const searchQuery = url.searchParams.get('q');
  if (!searchQuery) {
    // Endpoints must answer with a Response; `json` builds one carrying the status code.
    return json({ error: 'Query parameter "q" is required' }, { status: 400 });
  }

  try {
    // Loaded only after validation, and inside the try so a load failure becomes a 500.
    const model = await getModel();

    // Generate embedding for the query
    const queryEmbedding = await model.embed([searchQuery]);
    const queryVec = queryEmbedding.arraySync()[0];

    // Calculate similarities
    const results = poemEmbeddings
      .map((poem: Embedding) => ({
        poem,
        similarity: cosineSimilarity(queryVec, poem.vector)
      }))
      .sort((a: SearchResult, b: SearchResult) => b.similarity - a.similarity)
      .slice(0, 10); // Top 10 results

    return json(results);
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    return json({ error: message }, { status: 500 });
  }
};
/**
 * Computes the cosine similarity of two vectors.
 *
 * Both vectors are expected to have the same length; the dot product is taken
 * over the indices of `vecA`.
 *
 * @param vecA - First vector.
 * @param vecB - Second vector.
 * @returns The cosine of the angle between the vectors, or `0` when either
 *   vector has zero magnitude (the similarity is undefined there, and a `NaN`
 *   would break numeric sorting of the results).
 */
function cosineSimilarity(vecA: number[], vecB: number[]): number {
  const dotProduct = vecA.reduce((acc, val, i) => acc + val * vecB[i], 0);
  const magnitudeA = Math.sqrt(vecA.reduce((acc, val) => acc + val * val, 0));
  const magnitudeB = Math.sqrt(vecB.reduce((acc, val) => acc + val * val, 0));
  const denominator = magnitudeA * magnitudeB;
  // Guard against 0 / 0 for empty or all-zero vectors.
  if (denominator === 0) {
    return 0;
  }
  return dotProduct / denominator;
}

View File

@@ -0,0 +1,88 @@
// eslint-disable-next-line
import * as tf from '@tensorflow/tfjs-node';
import postEmbeddings from '$lib/utils/poetry/embeddings.json';
import { json } from '@sveltejs/kit';
import { getModel, type Embedding, type SearchResult } from '$lib/utils/search';
import { fetchMarkdownPosts } from '$lib/utils';
import Fuse from 'fuse.js';
/**
 * Search handler.
 *
 * Reads the query from the `q` URL parameter, runs a fuzzy (Fuse.js) search
 * over all markdown posts and a semantic search over the stored post
 * embeddings, merges both rankings and responds with the ten best matches.
 *
 * @param event - Request event; only `url` is read.
 * @returns A JSON `Response`: the ranked results on success, `{ error }` with
 *   status 400 when `q` is missing or empty, or `{ error }` with status 500
 *   when any step of the search throws.
 */
export const GET = async ({ url }: { url: URL }) => {
  const searchQuery = url.searchParams.get('q');
  if (!searchQuery) {
    // Endpoints must answer with a Response; `json` builds one carrying the status code.
    return json({ error: 'Query parameter "q" is required' }, { status: 400 });
  }

  try {
    const model = await getModel();
    const { posts } = await fetchMarkdownPosts('all');

    // NOTE(review): the Metadata interface visible in this change declares
    // `categories`, not `tags` — confirm that 'meta.tags' is the intended key.
    const fuse = new Fuse(posts, {
      keys: ['content', 'meta.title', 'meta.tags'],
      includeScore: true
    });

    // Fuzzy search
    const fuzzyResults = fuse.search(searchQuery, { limit: 10 });

    // Generate embedding for the query
    // NOTE(review): if `embed` returns a tfjs tensor it may need to be disposed
    // after `arraySync()` — confirm against the model wrapper.
    const queryEmbedding = await model.embed([searchQuery]);
    const queryVec = queryEmbedding.arraySync()[0];

    // Calculate similarities
    let semanticResults = postEmbeddings.map((post: Embedding) => ({
      post,
      similarity: cosineSimilarity(queryVec, post.vector)
    }));

    // add fuzzy results to semantic results
    semanticResults = semanticResults.map((semanticResult) => {
      const fuzzyResultIndex = fuzzyResults.findIndex(
        (fuzzyResult) =>
          fuzzyResult.item.section === semanticResult.post.section &&
          fuzzyResult.item.filename === semanticResult.post.filename
      );
      if (fuzzyResultIndex > -1) {
        // Remove the match so it is not appended a second time below.
        const fuzzyResult = fuzzyResults.splice(fuzzyResultIndex, 1)[0];
        // Fuse scores run from 0 (perfect match) to 1 (mismatch), so the score
        // must be compared against undefined: a truthiness test would discard
        // perfect matches.
        if (fuzzyResult.score !== undefined && 1 - fuzzyResult.score > 0.8) {
          // NOTE(review): evaluates as 1 - (score / 2) + similarity; confirm
          // that (1 - score) / 2 was not intended.
          semanticResult.similarity = 1 - fuzzyResult.score / 2 + semanticResult.similarity;
        }
      }
      return semanticResult;
    });

    // add rest of fuzzy results
    semanticResults.push(
      ...fuzzyResults.map((fuzzyResult) => {
        let similarity = 0;
        // Explicit undefined check so that a perfect match (score 0) is ranked.
        if (fuzzyResult.score !== undefined && 1 - fuzzyResult.score > 0.9) {
          similarity = 1 - fuzzyResult.score / 2;
        }
        return {
          post: {
            id: fuzzyResult.item.id,
            section: fuzzyResult.item.section,
            filename: fuzzyResult.item.filename,
            // Placeholder: posts found only by fuzzy search have no embedding here.
            vector: [0, 0, 0]
          },
          similarity: similarity
        };
      })
    );

    semanticResults = semanticResults
      .sort((a: SearchResult, b: SearchResult) => b.similarity - a.similarity)
      .slice(0, 10);

    return json(semanticResults);
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    return json({ error: message }, { status: 500 });
  }
};
/**
 * Computes the cosine similarity of two vectors.
 *
 * Both vectors are expected to have the same length; the dot product is taken
 * over the indices of `vecA`.
 *
 * @param vecA - First vector.
 * @param vecB - Second vector.
 * @returns The cosine of the angle between the vectors, or `0` when either
 *   vector has zero magnitude (the similarity is undefined there, and a `NaN`
 *   would break numeric sorting of the results).
 */
function cosineSimilarity(vecA: number[], vecB: number[]): number {
  const dotProduct = vecA.reduce((acc, val, i) => acc + val * vecB[i], 0);
  const magnitudeA = Math.sqrt(vecA.reduce((acc, val) => acc + val * val, 0));
  const magnitudeB = Math.sqrt(vecB.reduce((acc, val) => acc + val * val, 0));
  const denominator = magnitudeA * magnitudeB;
  // Guard against 0 / 0 for empty or all-zero vectors.
  if (denominator === 0) {
    return 0;
  }
  return dotProduct / denominator;
}