add fuzzy search and combine it with semantic search results, update readme, change license file extension

Silas 2024-06-10 20:51:35 -04:00
parent 3242c3bc77
commit 3320683d10
13 changed files with 176 additions and 88 deletions
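For orientation before the file-by-file diff: the new search endpoint combines a Fuse.js fuzzy score over the raw post content with a cosine similarity between Universal Sentence Encoder embeddings. The sketch below is an illustrative restatement of the blending rule from the `/api/search` handler further down; the `blendScores` helper is hypothetical and not part of the commit.

```ts
// Illustrative helper, not part of this commit: it restates the blending rule in
// the new /api/search endpoint. Fuse.js scores run from 0 (perfect match) to
// 1 (no match), so `1 - score` acts as a relevance value.
function blendScores(semanticSimilarity: number, fuseScore?: number): number {
	// Post only matched semantically: keep the cosine similarity as-is.
	if (fuseScore === undefined) return semanticSimilarity;
	// Only strong fuzzy matches (relevance above 0.8) boost the semantic score.
	if (1 - fuseScore > 0.8) {
		return semanticSimilarity + (1 - fuseScore / 2);
	}
	return semanticSimilarity;
}

// Example: a near-exact match (fuseScore ≈ 0.1) on a post with cosine similarity
// 0.4 blends to roughly 0.4 + 0.95 = 1.35, pushing it to the top of the results.
// Posts found only by Fuse.js use a stricter 0.9 threshold and score 1 - score / 2.
```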


@@ -35,6 +35,6 @@ It will traverse through every `*.md` under `src/posts/poetry` and generate the
## License
This project is licensed under the [MIT License](src/branch/main/LICENSE.md). This means that you can do pretty much anything you want with this code as long as you include the original copyright and license notice in your project.
This project is licensed under the [MIT License](src/branch/main/LICENSE). This means that you can do pretty much anything you want with this code as long as you include the original copyright and license notice in your project.
Content contained in the `src/posts` folder is under the [CC BY-NC-SA-4.0](src/branch/main/src/posts/LICENSE.md) license. You're free to modify/distribute the posts contained in the `src/posts` folder so long as it's not for commercial purposes, you give attribution, and point out any modifications you've made.
Content contained in the `src/posts` folder is under the [CC BY-NC-SA-4.0](src/branch/main/src/posts/LICENSE) license. You're free to modify/distribute the posts contained in the `src/posts` folder so long as it's not for commercial purposes, you give attribution, and point out any modifications you've made.

package-lock.json generated

@@ -14,6 +14,7 @@
"@threlte/core": "^7.3.0",
"@threlte/extras": "^8.11.2",
"@threlte/rapier": "^2.0.0",
"fuse.js": "^7.0.0",
"marked": "^12.0.2",
"mdsvex": "^0.11.0",
"three": "^0.159.0"
@@ -3427,6 +3428,14 @@
"url": "https://github.com/sponsors/ljharb"
}
},
"node_modules/fuse.js": {
"version": "7.0.0",
"resolved": "https://registry.npmjs.org/fuse.js/-/fuse.js-7.0.0.tgz",
"integrity": "sha512-14F4hBIxqKvD4Zz/XjDc3y94mNZN6pRv3U13Udo0lNLCWRBUsrMv2xwcF/y/Z5sV6+FQW+/ow68cHpm4sunt8Q==",
"engines": {
"node": ">=10"
}
},
"node_modules/gauge": {
"version": "3.0.2",
"resolved": "https://registry.npmjs.org/gauge/-/gauge-3.0.2.tgz",


@@ -52,6 +52,7 @@
"@threlte/core": "^7.3.0",
"@threlte/extras": "^8.11.2",
"@threlte/rapier": "^2.0.0",
"fuse.js": "^7.0.0",
"marked": "^12.0.2",
"mdsvex": "^0.11.0",
"three": "^0.159.0"


@@ -8,32 +8,42 @@ import { marked } from 'marked';
async function extractTextFromMarkdown(filePath) {
const markdown = await fs.readFile(filePath, 'utf8');
return marked(markdown).replace(/<[^>]*>/g, ''); // Strip HTML tags generated by marked
// remove yaml frontmatter metadata
const result = markdown.replace(/---[\s\S]*?---/gm, '');
// remove html tags
const text = marked(result).replace(/<[^>]*>/g, '');
return text;
}
async function generateEmbeddingsForDirectory(directoryPath) {
// Get all markdown files in directory
const files = glob.sync(`${directoryPath}/*.md`);
const files = glob.sync(`${directoryPath}/**/*.md`, {
ignore: [`${directoryPath}/LICENSE.md`]
});
// Extract texts from markdown files
// eslint-disable-next-line @typescript-eslint/no-unused-vars
const poems = await Promise.all(files.map(async (file, _index) => ({
id: path.basename(file, '.md'), // Use filename as ID
text: await extractTextFromMarkdown(file)
const posts = await Promise.all(files.map(async (file, _index) => ({
id: path.basename(file, '.md'), // Use filename as ID
text: await extractTextFromMarkdown(file),
section: path.basename(path.dirname(file)),
filename: path.basename(file)
})));
// Load the Universal Sentence Encoder model
const model = await use.load();
const embeddings = await Promise.all(poems.map(poem => model.embed([poem.text])));
const embeddings = await Promise.all(posts.map(post => model.embed([post.text])));
// Map embeddings back to poem objects
const poemEmbeddings = poems.map((poem, index) => ({
id: poem.id,
vector: embeddings[index].arraySync()[0] // Extract the vector
// Map embeddings back to post objects
const poemEmbeddings = posts.map((post, index) => ({
id: post.id,
vector: embeddings[index].arraySync()[0], // Extract the vector
section: post.section,
filename: post.id
}));
// Save embeddings to JSON file
fs.writeJson('embeddings.json', poemEmbeddings);
}
generateEmbeddingsForDirectory('src/posts/poetry'); // Update path accordingly
generateEmbeddingsForDirectory('src/posts'); // Update path accordingly
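With the wider glob, the script now embeds every post under `src/posts` (skipping `LICENSE.md`) and writes one record per file. A hedged sketch of what a single `embeddings.json` entry should look like, inferred from the mapping above (the name `some-post` and the vector values are placeholders):

```ts
// Hedged sketch of one record in the regenerated embeddings.json; field names
// match the Embedding type added later in this commit. The real vector is a
// Universal Sentence Encoder embedding (512 floats in practice).
const exampleEntry = {
	id: 'some-post',         // basename without the .md extension (placeholder name)
	vector: [0.012, -0.044], // truncated here; the real array has 512 values
	section: 'poetry',       // parent folder under src/posts
	filename: 'some-post'    // currently set to the id, so the extension is already stripped
};
```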


@@ -2,18 +2,22 @@
import type { SearchResult } from '$lib/utils/search';
import { searchResults } from '$lib/store';
let searchQuery = '';
let timer: NodeJS.Timeout | undefined;
async function handleSearch() {
// const section = window.location.pathname.split('/')[1];
const response = await fetch(`/api/poetry/search?q=${encodeURIComponent(searchQuery)}`);
if (response.ok) {
const data: SearchResult[] = await response.json();
searchResults.set(data);
} else {
console.error('Failed to fetch search results');
searchResults.set([]);
}
async function handleSearch({ target }: Event) {
const { value } = target as HTMLInputElement;
clearTimeout(timer);
timer = setTimeout(async () => {
// const section = window.location.pathname.split('/')[1];
const response = await fetch(`/api/search?q=${encodeURIComponent(value)}`);
if (response.ok) {
const data: SearchResult[] = await response.json();
searchResults.set(data);
} else {
console.error('Failed to fetch search results');
searchResults.set([]);
}
}, 300);
}
</script>
@@ -54,20 +58,13 @@
type="text"
placeholder="Search"
class="input w-24 md:w-auto"
bind:value={searchQuery}
on:input={handleSearch}
on:keyup={handleSearch}
/>
</div>
</div>
<div class="navbar-end hidden lg:flex">
<div class="form-control">
<input
type="text"
placeholder="Search"
class="input md:w-auto"
bind:value={searchQuery}
on:input={handleSearch}
/>
<input type="text" placeholder="Search" class="input md:w-auto" on:keyup={handleSearch} />
</div>
<ul class="menu menu-horizontal px-1">
<li><a href="/thoughts" class="link-primary">Thoughts</a></li>
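Both search inputs now share a single `keyup` handler that debounces the request: every keystroke clears the pending timer, and the fetch only fires after 300 ms of quiet. The standalone `debounce` helper below is a hypothetical restatement of that timer pattern, not code from this commit:

```ts
// Hypothetical standalone debounce helper illustrating the timer pattern used
// in handleSearch above: only the last call within the delay window actually runs.
function debounce<T extends unknown[]>(fn: (...args: T) => void, delay = 300) {
	let timer: ReturnType<typeof setTimeout> | undefined;
	return (...args: T) => {
		clearTimeout(timer);
		timer = setTimeout(() => fn(...args), delay);
	};
}

// Usage sketch: const debouncedSearch = debounce(runSearch);
// input.addEventListener('keyup', debouncedSearch);
```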


@@ -31,8 +31,12 @@
{#each results as result}
<li class="py-4">
<h3 class="pb-1">
<a class="link" href={`/poetry/${result.poem.id}`}>{slugToTitle(result.poem.id)}</a>
<p class="text-sm">(Relevance: {(result.similarity * 100).toFixed(3)}%)</p>
<a class="link" href={`/${result.post.section}/${result.post.filename}`} target="_blank"
>{slugToTitle(result.post.id)}</a
>
<p class="text-sm">
(Relevance: {(result.similarity * 100).toFixed(2)}%, Section: {result.post.section})
</p>
</h3>
</li>
{/each}


@@ -1,7 +1,6 @@
export interface Metadata {
title: string;
date: string;
content: string;
categories?: string[];
draft?: boolean;
}
@@ -9,18 +8,25 @@ export interface Metadata {
export interface Section {
poetry: 'poetry';
thoughts: 'thoughts';
projects: 'projects';
services: 'services';
all: 'all';
}
type SectionKey = keyof Section;
export interface Post {
meta: Metadata;
path: string;
content: string;
section: string;
filename: string;
id: string;
}
interface Data {
metadata: Metadata;
default: {
render: () => { html: string };
};
}
function isData(obj: unknown): obj is Data {
@@ -39,20 +45,19 @@ function isData(obj: unknown): obj is Data {
export const fetchMarkdownPosts = async (
section: SectionKey,
limit: number,
offset: number
limit?: number,
offset?: number
): Promise<{ posts: Post[]; total: number }> => {
let posts: Record<string, () => Promise<unknown>>;
switch (section) {
case 'all':
posts = import.meta.glob('/src/posts/**/*.md');
break;
case 'poetry':
posts = import.meta.glob('/src/posts/poetry/*.md');
break;
case 'projects':
posts = import.meta.glob('/src/routes/(app)/projects/posts/*.md');
break;
case 'thoughts':
posts = import.meta.glob('/src/posts/thoughts/*.md');
console.log(posts);
break;
default:
throw new Error('Could not find this section');
@@ -70,10 +75,18 @@ export const fetchMarkdownPosts = async (
return undefined;
}
const { metadata } = data;
const postPath = path.slice(11, -3);
const { html } = data.default.render();
// remove html tags
const content = html.replace(/<[^>]*>/g, '');
const section = path.split('/')[3];
const filename = path.split('/').pop()?.slice(0, -3);
return {
meta: { ...metadata },
path: postPath
content,
section,
filename,
id: data.metadata.title
};
} else {
console.error('Could not properly parse this post');
@@ -94,6 +107,10 @@ export const fetchMarkdownPosts = async (
(b, a) => new Date(a?.meta.date || '').getTime() - new Date(b?.meta.date || '').getTime()
);
if (limit === undefined || offset === undefined) {
return { posts: sortedPosts, total: allPosts.length };
}
const paginatedPosts = sortedPosts.slice(offset, offset + limit);
return { posts: paginatedPosts, total: allPosts.length };
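Because `limit` and `offset` are now optional, callers can request either a single page or the entire sorted list (the new search endpoint relies on the latter). A hedged usage sketch, with the import path taken from the new `/api/search` handler and the surrounding `loadExamples` function invented for illustration:

```ts
import { fetchMarkdownPosts } from '$lib/utils'; // import path as used by the new endpoint

// Hypothetical caller showing both modes of the relaxed signature.
async function loadExamples() {
	// Every post across all sections, newest first — what the /api/search handler requests.
	const { posts: allPosts, total } = await fetchMarkdownPosts('all');

	// First page of ten poetry posts.
	const { posts: firstPage } = await fetchMarkdownPosts('poetry', 10, 0);

	return { allPosts, total, firstPage };
}
```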

File diff suppressed because one or more lines are too long


@@ -8,10 +8,12 @@ import * as tf from '@tensorflow/tfjs-node';
export type Embedding = {
id: string;
vector: number[];
section: string;
filename: string;
};
export type SearchResult = {
poem: Embedding;
post: Embedding;
similarity: number;
};
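Taken together, the search endpoint responds with an array of these `SearchResult` objects, which the Navbar drops into the `searchResults` store. An invented example payload for illustration:

```ts
// Made-up payload showing what GET /api/search?q=... returns; real entries carry
// full 512-value vectors, while fuzzy-only hits get the [0, 0, 0] placeholder
// seen in the new endpoint below.
const exampleResponse: SearchResult[] = [
	{
		post: { id: 'some-post', vector: [0, 0, 0], section: 'poetry', filename: 'some-post' },
		similarity: 1.31
	}
];
```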


@@ -1,40 +0,0 @@
// eslint-disable-next-line
import * as tf from '@tensorflow/tfjs-node';
import poemEmbeddings from '$lib/utils/poetry/embeddings.json';
import { json } from '@sveltejs/kit';
import { getModel, type Embedding, type SearchResult } from '$lib/utils/search';
// Search handler
export const GET = async ({ url }: { url: URL }) => {
const model = await getModel();
const searchQuery = url.searchParams.get('q');
if (!searchQuery) {
return { status: 400, body: { error: 'Query parameter "q" is required' } };
}
try {
// Generate embedding for the query
const queryEmbedding = await model.embed([searchQuery]);
const queryVec = queryEmbedding.arraySync()[0];
// Calculate similarities
const results = poemEmbeddings
.map((poem: Embedding) => ({
poem,
similarity: cosineSimilarity(queryVec, poem.vector)
}))
.sort((a: SearchResult, b: SearchResult) => b.similarity - a.similarity)
.slice(0, 10); // Top 10 results
return json(results);
} catch (error) {
return { status: 500, body: { error: (error as Error).message } };
}
};
function cosineSimilarity(vecA: number[], vecB: number[]) {
const dotProduct = vecA.reduce((acc, val, i) => acc + val * vecB[i], 0);
const magnitudeA = Math.sqrt(vecA.reduce((acc, val) => acc + val * val, 0));
const magnitudeB = Math.sqrt(vecB.reduce((acc, val) => acc + val * val, 0));
return dotProduct / (magnitudeA * magnitudeB);
}


@@ -0,0 +1,88 @@
// eslint-disable-next-line
import * as tf from '@tensorflow/tfjs-node';
import postEmbeddings from '$lib/utils/poetry/embeddings.json';
import { json } from '@sveltejs/kit';
import { getModel, type Embedding, type SearchResult } from '$lib/utils/search';
import { fetchMarkdownPosts } from '$lib/utils';
import Fuse from 'fuse.js';
// Search handler
export const GET = async ({ url }: { url: URL }) => {
const searchQuery = url.searchParams.get('q');
if (!searchQuery) {
return { status: 400, body: { error: 'Query parameter "q" is required' } };
}
try {
const model = await getModel();
const { posts } = await fetchMarkdownPosts('all');
const fuse = new Fuse(posts, {
keys: ['content', 'meta.title', 'meta.tags'],
includeScore: true
});
// Fuzzy search
const fuzzyResults = fuse.search(searchQuery, { limit: 10 });
// Generate embedding for the query
const queryEmbedding = await model.embed([searchQuery]);
const queryVec = queryEmbedding.arraySync()[0];
// Calculate similarities
let semanticResults = postEmbeddings.map((post: Embedding) => ({
post,
similarity: cosineSimilarity(queryVec, post.vector)
}));
// add fuzzy results to semantic results
semanticResults = semanticResults.map((semanticResult) => {
const fuzzyResultIndex = fuzzyResults.findIndex(
(fuzzyResult) =>
fuzzyResult.item.section === semanticResult.post.section &&
fuzzyResult.item.filename === semanticResult.post.filename
);
if (fuzzyResultIndex > -1) {
const fuzzyResult = fuzzyResults.splice(fuzzyResultIndex, 1)[0];
if (fuzzyResult.score && 1 - fuzzyResult.score > 0.8) {
semanticResult.similarity = 1 - fuzzyResult.score / 2 + semanticResult.similarity;
}
}
return semanticResult;
});
// add rest of fuzzy results
semanticResults.push(
...fuzzyResults.map((fuzzyResult) => {
let similarity = 0;
if (fuzzyResult.score && 1 - fuzzyResult.score > 0.9) {
similarity = 1 - fuzzyResult.score / 2;
}
return {
post: {
id: fuzzyResult.item.id,
section: fuzzyResult.item.section,
filename: fuzzyResult.item.filename,
vector: [0, 0, 0]
},
similarity: similarity
};
})
);
semanticResults = semanticResults
.sort((a: SearchResult, b: SearchResult) => b.similarity - a.similarity)
.slice(0, 10);
return json(semanticResults);
} catch (error) {
return { status: 500, body: { error: (error as Error).message } };
}
};
function cosineSimilarity(vecA: number[], vecB: number[]) {
const dotProduct = vecA.reduce((acc, val, i) => acc + val * vecB[i], 0);
const magnitudeA = Math.sqrt(vecA.reduce((acc, val) => acc + val * val, 0));
const magnitudeB = Math.sqrt(vecB.reduce((acc, val) => acc + val * val, 0));
return dotProduct / (magnitudeA * magnitudeB);
}
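The handler ranks every post by the cosine similarity between the query embedding and each stored vector before blending in the fuzzy boosts. A quick, purely illustrative sanity check of the `cosineSimilarity` helper:

```ts
// Sanity check of cosineSimilarity above (illustrative only): parallel vectors
// score 1, orthogonal vectors score 0, opposite vectors score -1.
console.log(cosineSimilarity([1, 0], [2, 0]));  // 1
console.log(cosineSimilarity([1, 0], [0, 1]));  // 0
console.log(cosineSimilarity([1, 0], [-1, 0])); // -1
```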