add fuzzy search and combine it with semantic search results, update readme, change license file extension

This commit is contained in:
Silas 2024-06-10 20:51:35 -04:00
parent 3242c3bc77
commit 3320683d10
Failed to generate hash of commit
13 changed files with 176 additions and 88 deletions

View File

View File

@ -35,6 +35,6 @@ It will traverse through every `*.md` under `src/posts/poetry` and generate the
## License ## License
This project is licensed under the [MIT License](src/branch/main/LICENSE.md). This means that you can do pretty much anything you want with this code as long as you include the original copyright and license notice in your project. This project is licensed under the [MIT License](src/branch/main/LICENSE). This means that you can do pretty much anything you want with this code as long as you include the original copyright and license notice in your project.
Content contained in the `src/posts` folder is under the [CC BY-NC-SA-4.0](src/branch/main/src/posts/LICENSE.md) license. You're free to modify/distribute the posts contained in the `src/posts` folder so long as it's not for commercial purposes, you give attribution, and point out any modifications you've made. Content contained in the `src/posts` folder is under the [CC BY-NC-SA-4.0](src/branch/main/src/posts/LICENSE) license. You're free to modify/distribute the posts contained in the `src/posts` folder so long as it's not for commercial purposes, you give attribution, and point out any modifications you've made.

9
package-lock.json generated
View File

@ -14,6 +14,7 @@
"@threlte/core": "^7.3.0", "@threlte/core": "^7.3.0",
"@threlte/extras": "^8.11.2", "@threlte/extras": "^8.11.2",
"@threlte/rapier": "^2.0.0", "@threlte/rapier": "^2.0.0",
"fuse.js": "^7.0.0",
"marked": "^12.0.2", "marked": "^12.0.2",
"mdsvex": "^0.11.0", "mdsvex": "^0.11.0",
"three": "^0.159.0" "three": "^0.159.0"
@ -3427,6 +3428,14 @@
"url": "https://github.com/sponsors/ljharb" "url": "https://github.com/sponsors/ljharb"
} }
}, },
"node_modules/fuse.js": {
"version": "7.0.0",
"resolved": "https://registry.npmjs.org/fuse.js/-/fuse.js-7.0.0.tgz",
"integrity": "sha512-14F4hBIxqKvD4Zz/XjDc3y94mNZN6pRv3U13Udo0lNLCWRBUsrMv2xwcF/y/Z5sV6+FQW+/ow68cHpm4sunt8Q==",
"engines": {
"node": ">=10"
}
},
"node_modules/gauge": { "node_modules/gauge": {
"version": "3.0.2", "version": "3.0.2",
"resolved": "https://registry.npmjs.org/gauge/-/gauge-3.0.2.tgz", "resolved": "https://registry.npmjs.org/gauge/-/gauge-3.0.2.tgz",

View File

@ -52,6 +52,7 @@
"@threlte/core": "^7.3.0", "@threlte/core": "^7.3.0",
"@threlte/extras": "^8.11.2", "@threlte/extras": "^8.11.2",
"@threlte/rapier": "^2.0.0", "@threlte/rapier": "^2.0.0",
"fuse.js": "^7.0.0",
"marked": "^12.0.2", "marked": "^12.0.2",
"mdsvex": "^0.11.0", "mdsvex": "^0.11.0",
"three": "^0.159.0" "three": "^0.159.0"

View File

@ -8,32 +8,42 @@ import { marked } from 'marked';
async function extractTextFromMarkdown(filePath) { async function extractTextFromMarkdown(filePath) {
const markdown = await fs.readFile(filePath, 'utf8'); const markdown = await fs.readFile(filePath, 'utf8');
return marked(markdown).replace(/<[^>]*>/g, ''); // Strip HTML tags generated by marked // remove yaml frontmatter metadata
const result = markdown.replace(/---[\s\S]*?---/gm, '');
// remove html tags
const text = marked(result).replace(/<[^>]*>/g, '');
return text;
} }
async function generateEmbeddingsForDirectory(directoryPath) { async function generateEmbeddingsForDirectory(directoryPath) {
// Get all markdown files in directory // Get all markdown files in directory
const files = glob.sync(`${directoryPath}/*.md`); const files = glob.sync(`${directoryPath}/**/*.md`, {
ignore: [`${directoryPath}/LICENSE.md`]
});
// Extract texts from markdown files // Extract texts from markdown files
// eslint-disable-next-line @typescript-eslint/no-unused-vars // eslint-disable-next-line @typescript-eslint/no-unused-vars
const poems = await Promise.all(files.map(async (file, _index) => ({ const posts = await Promise.all(files.map(async (file, _index) => ({
id: path.basename(file, '.md'), // Use filename as ID id: path.basename(file, '.md'), // Use filename as ID,
text: await extractTextFromMarkdown(file) text: await extractTextFromMarkdown(file),
section: path.basename(path.dirname(file)),
filename: path.basename(file)
}))); })));
// Load the Universal Sentence Encoder model // Load the Universal Sentence Encoder model
const model = await use.load(); const model = await use.load();
const embeddings = await Promise.all(poems.map(poem => model.embed([poem.text]))); const embeddings = await Promise.all(posts.map(post => model.embed([post.text])));
// Map embeddings back to poem objects // Map embeddings back to post objects
const poemEmbeddings = poems.map((poem, index) => ({ const poemEmbeddings = posts.map((post, index) => ({
id: poem.id, id: post.id,
vector: embeddings[index].arraySync()[0] // Extract the vector vector: embeddings[index].arraySync()[0], // Extract the vector
section: post.section,
filename: post.id
})); }));
// Save embeddings to JSON file // Save embeddings to JSON file
fs.writeJson('embeddings.json', poemEmbeddings); fs.writeJson('embeddings.json', poemEmbeddings);
} }
generateEmbeddingsForDirectory('src/posts/poetry'); // Update path accordingly generateEmbeddingsForDirectory('src/posts'); // Update path accordingly

View File

@ -2,18 +2,22 @@
import type { SearchResult } from '$lib/utils/search'; import type { SearchResult } from '$lib/utils/search';
import { searchResults } from '$lib/store'; import { searchResults } from '$lib/store';
let searchQuery = ''; let timer: NodeJS.Timeout | undefined;
async function handleSearch() { async function handleSearch({ target }: Event) {
// const section = window.location.pathname.split('/')[1]; const { value } = target as HTMLInputElement;
const response = await fetch(`/api/poetry/search?q=${encodeURIComponent(searchQuery)}`); clearTimeout(timer);
if (response.ok) { timer = setTimeout(async () => {
const data: SearchResult[] = await response.json(); // const section = window.location.pathname.split('/')[1];
searchResults.set(data); const response = await fetch(`/api/search?q=${encodeURIComponent(value)}`);
} else { if (response.ok) {
console.error('Failed to fetch search results'); const data: SearchResult[] = await response.json();
searchResults.set([]); searchResults.set(data);
} } else {
console.error('Failed to fetch search results');
searchResults.set([]);
}
}, 300);
} }
</script> </script>
@ -54,20 +58,13 @@
type="text" type="text"
placeholder="Search" placeholder="Search"
class="input w-24 md:w-auto" class="input w-24 md:w-auto"
bind:value={searchQuery} on:keyup={handleSearch}
on:input={handleSearch}
/> />
</div> </div>
</div> </div>
<div class="navbar-end hidden lg:flex"> <div class="navbar-end hidden lg:flex">
<div class="form-control"> <div class="form-control">
<input <input type="text" placeholder="Search" class="input md:w-auto" on:keyup={handleSearch} />
type="text"
placeholder="Search"
class="input md:w-auto"
bind:value={searchQuery}
on:input={handleSearch}
/>
</div> </div>
<ul class="menu menu-horizontal px-1"> <ul class="menu menu-horizontal px-1">
<li><a href="/thoughts" class="link-primary">Thoughts</a></li> <li><a href="/thoughts" class="link-primary">Thoughts</a></li>

View File

@ -31,8 +31,12 @@
{#each results as result} {#each results as result}
<li class="py-4"> <li class="py-4">
<h3 class="pb-1"> <h3 class="pb-1">
<a class="link" href={`/poetry/${result.poem.id}`}>{slugToTitle(result.poem.id)}</a> <a class="link" href={`/${result.post.section}/${result.post.filename}`} target="_blank"
<p class="text-sm">(Relevance: {(result.similarity * 100).toFixed(3)}%)</p> >{slugToTitle(result.post.id)}</a
>
<p class="text-sm">
(Relevance: {(result.similarity * 100).toFixed(2)}%, Section: {result.post.section})
</p>
</h3> </h3>
</li> </li>
{/each} {/each}

View File

@ -1,7 +1,6 @@
export interface Metadata { export interface Metadata {
title: string; title: string;
date: string; date: string;
content: string;
categories?: string[]; categories?: string[];
draft?: boolean; draft?: boolean;
} }
@ -9,18 +8,25 @@ export interface Metadata {
export interface Section { export interface Section {
poetry: 'poetry'; poetry: 'poetry';
thoughts: 'thoughts'; thoughts: 'thoughts';
projects: 'projects'; services: 'services';
all: 'all';
} }
type SectionKey = keyof Section; type SectionKey = keyof Section;
export interface Post { export interface Post {
meta: Metadata; meta: Metadata;
path: string; content: string;
section: string;
filename: string;
id: string;
} }
interface Data { interface Data {
metadata: Metadata; metadata: Metadata;
default: {
render: () => { html: string };
};
} }
function isData(obj: unknown): obj is Data { function isData(obj: unknown): obj is Data {
@ -39,20 +45,19 @@ function isData(obj: unknown): obj is Data {
export const fetchMarkdownPosts = async ( export const fetchMarkdownPosts = async (
section: SectionKey, section: SectionKey,
limit: number, limit?: number,
offset: number offset?: number
): Promise<{ posts: Post[]; total: number }> => { ): Promise<{ posts: Post[]; total: number }> => {
let posts: Record<string, () => Promise<unknown>>; let posts: Record<string, () => Promise<unknown>>;
switch (section) { switch (section) {
case 'all':
posts = import.meta.glob('/src/posts/**/*.md');
break;
case 'poetry': case 'poetry':
posts = import.meta.glob('/src/posts/poetry/*.md'); posts = import.meta.glob('/src/posts/poetry/*.md');
break; break;
case 'projects':
posts = import.meta.glob('/src/routes/(app)/projects/posts/*.md');
break;
case 'thoughts': case 'thoughts':
posts = import.meta.glob('/src/posts/thoughts/*.md'); posts = import.meta.glob('/src/posts/thoughts/*.md');
console.log(posts);
break; break;
default: default:
throw new Error('Could not find this section'); throw new Error('Could not find this section');
@ -70,10 +75,18 @@ export const fetchMarkdownPosts = async (
return undefined; return undefined;
} }
const { metadata } = data; const { metadata } = data;
const postPath = path.slice(11, -3); const { html } = data.default.render();
// remove html tags
const content = html.replace(/<[^>]*>/g, '');
const section = path.split('/')[3];
const filename = path.split('/').pop()?.slice(0, -3);
return { return {
meta: { ...metadata }, meta: { ...metadata },
path: postPath content,
section,
filename,
id: data.metadata.title
}; };
} else { } else {
console.error('Could not properly parse this post'); console.error('Could not properly parse this post');
@ -94,6 +107,10 @@ export const fetchMarkdownPosts = async (
(b, a) => new Date(a?.meta.date || '').getTime() - new Date(b?.meta.date || '').getTime() (b, a) => new Date(a?.meta.date || '').getTime() - new Date(b?.meta.date || '').getTime()
); );
if (limit === undefined || offset === undefined) {
return { posts: sortedPosts, total: allPosts.length };
}
const paginatedPosts = sortedPosts.slice(offset, offset + limit); const paginatedPosts = sortedPosts.slice(offset, offset + limit);
return { posts: paginatedPosts, total: allPosts.length }; return { posts: paginatedPosts, total: allPosts.length };

File diff suppressed because one or more lines are too long

View File

@ -8,10 +8,12 @@ import * as tf from '@tensorflow/tfjs-node';
export type Embedding = { export type Embedding = {
id: string; id: string;
vector: number[]; vector: number[];
section: string;
filename: string;
}; };
export type SearchResult = { export type SearchResult = {
poem: Embedding; post: Embedding;
similarity: number; similarity: number;
}; };

View File

@ -1,40 +0,0 @@
// eslint-disable-next-line
import * as tf from '@tensorflow/tfjs-node';
import poemEmbeddings from '$lib/utils/poetry/embeddings.json';
import { json } from '@sveltejs/kit';
import { getModel, type Embedding, type SearchResult } from '$lib/utils/search';
// Search handler: semantic search over precomputed poem embeddings.
// Embeds the `q` query parameter with the sentence-encoder model from
// getModel(), scores every entry in poemEmbeddings by cosine similarity
// against the query vector, and returns the 10 best matches as JSON.
export const GET = async ({ url }: { url: URL }) => {
  const model = await getModel();
  const searchQuery = url.searchParams.get('q');
  if (!searchQuery) {
    // NOTE(review): returning a plain { status, body } object is the pre-1.0
    // SvelteKit endpoint shape; current SvelteKit requires a Response object
    // (e.g. json(..., { status: 400 })) — confirm the targeted framework version.
    return { status: 400, body: { error: 'Query parameter "q" is required' } };
  }
  try {
    // Generate embedding for the query
    const queryEmbedding = await model.embed([searchQuery]);
    const queryVec = queryEmbedding.arraySync()[0];
    // Calculate similarities: score each stored poem vector against the
    // query vector, best first.
    const results = poemEmbeddings
      .map((poem: Embedding) => ({
        poem,
        similarity: cosineSimilarity(queryVec, poem.vector)
      }))
      .sort((a: SearchResult, b: SearchResult) => b.similarity - a.similarity)
      .slice(0, 10); // Top 10 results
    return json(results);
  } catch (error) {
    // NOTE(review): same pre-1.0 plain-object shape as the 400 branch above.
    return { status: 500, body: { error: (error as Error).message } };
  }
};
/**
 * Cosine similarity of two equal-length numeric vectors:
 * dot(A, B) / (|A| * |B|). Result is in [-1, 1] for non-zero vectors.
 */
function cosineSimilarity(vecA: number[], vecB: number[]) {
  let dot = 0;
  let sumSqA = 0;
  let sumSqB = 0;
  // Single pass accumulates the dot product and both squared magnitudes.
  for (let i = 0; i < vecA.length; i++) {
    dot += vecA[i] * vecB[i];
    sumSqA += vecA[i] * vecA[i];
    sumSqB += vecB[i] * vecB[i];
  }
  return dot / (Math.sqrt(sumSqA) * Math.sqrt(sumSqB));
}

View File

@ -0,0 +1,88 @@
// eslint-disable-next-line
import * as tf from '@tensorflow/tfjs-node';
import postEmbeddings from '$lib/utils/poetry/embeddings.json';
import { json } from '@sveltejs/kit';
import { getModel, type Embedding, type SearchResult } from '$lib/utils/search';
import { fetchMarkdownPosts } from '$lib/utils';
import Fuse from 'fuse.js';
/**
 * GET /api/search?q=<query>
 *
 * Hybrid search endpoint: runs a Fuse.js fuzzy search over post content,
 * titles, and tags, a semantic search over precomputed sentence-encoder
 * embeddings, and merges the two into one ranked list.
 *
 * Responds with a JSON array of up to 10 SearchResult objects sorted by
 * descending combined similarity. 400 when `q` is missing, 500 on failure.
 */
export const GET = async ({ url }: { url: URL }) => {
  const searchQuery = url.searchParams.get('q');
  if (!searchQuery) {
    // FIX: a SvelteKit endpoint must return a Response; the previous plain
    // `{ status, body }` object is the pre-1.0 API and is rejected by current
    // SvelteKit. `json(data, init)` builds a proper Response with the status.
    return json({ error: 'Query parameter "q" is required' }, { status: 400 });
  }
  try {
    const model = await getModel();
    const { posts } = await fetchMarkdownPosts('all');
    const fuse = new Fuse(posts, {
      keys: ['content', 'meta.title', 'meta.tags'],
      includeScore: true
    });
    // Fuzzy search. Fuse scores: 0 = perfect match, 1 = no match.
    const fuzzyResults = fuse.search(searchQuery, { limit: 10 });
    // Generate embedding for the query
    const queryEmbedding = await model.embed([searchQuery]);
    const queryVec = queryEmbedding.arraySync()[0];
    // Semantic similarity against every precomputed post embedding.
    let semanticResults = postEmbeddings.map((post: Embedding) => ({
      post,
      similarity: cosineSimilarity(queryVec, post.vector)
    }));
    // Fold fuzzy hits into their matching semantic results; matched fuzzy
    // hits are spliced out so only unmatched ones remain for the next step.
    semanticResults = semanticResults.map((semanticResult) => {
      const fuzzyResultIndex = fuzzyResults.findIndex(
        (fuzzyResult) =>
          fuzzyResult.item.section === semanticResult.post.section &&
          fuzzyResult.item.filename === semanticResult.post.filename
      );
      if (fuzzyResultIndex > -1) {
        const fuzzyResult = fuzzyResults.splice(fuzzyResultIndex, 1)[0];
        // FIX: compare against undefined rather than truthiness — a perfect
        // fuzzy match has score 0, which the old `score && ...` check skipped.
        // Boost only strong matches (score < 0.2, i.e. 1 - score > 0.8).
        // NOTE(review): `1 - fuzzyResult.score / 2` parses as
        // `1 - (score / 2)`; confirm `(1 - score) / 2` was not intended.
        if (fuzzyResult.score !== undefined && 1 - fuzzyResult.score > 0.8) {
          semanticResult.similarity = 1 - fuzzyResult.score / 2 + semanticResult.similarity;
        }
      }
      return semanticResult;
    });
    // Append fuzzy-only hits (posts with no embedding entry), using a
    // placeholder vector; weak matches (score >= 0.1) keep similarity 0.
    semanticResults.push(
      ...fuzzyResults.map((fuzzyResult) => {
        let similarity = 0;
        // FIX: same undefined check — score 0 is a valid (perfect) match.
        if (fuzzyResult.score !== undefined && 1 - fuzzyResult.score > 0.9) {
          similarity = 1 - fuzzyResult.score / 2;
        }
        return {
          post: {
            id: fuzzyResult.item.id,
            section: fuzzyResult.item.section,
            filename: fuzzyResult.item.filename,
            vector: [0, 0, 0]
          },
          similarity: similarity
        };
      })
    );
    // Top 10 combined results, best first.
    semanticResults = semanticResults
      .sort((a: SearchResult, b: SearchResult) => b.similarity - a.similarity)
      .slice(0, 10);
    return json(semanticResults);
  } catch (error) {
    // FIX: same Response requirement as the 400 branch — use json() + status.
    return json({ error: (error as Error).message }, { status: 500 });
  }
};
/**
 * Cosine similarity of two equal-length numeric vectors:
 * dot(A, B) / (|A| * |B|).
 *
 * Returns 0 when either vector has zero magnitude — the unguarded division
 * previously produced NaN (0/0), which would poison the similarity sort.
 */
function cosineSimilarity(vecA: number[], vecB: number[]) {
  const dotProduct = vecA.reduce((acc, val, i) => acc + val * vecB[i], 0);
  const magnitudeA = Math.sqrt(vecA.reduce((acc, val) => acc + val * val, 0));
  const magnitudeB = Math.sqrt(vecB.reduce((acc, val) => acc + val * val, 0));
  const denominator = magnitudeA * magnitudeB;
  // Guard: a zero vector has no direction, so report no similarity.
  return denominator === 0 ? 0 : dotProduct / denominator;
}