add fuzzy search and combine it with semantic search results, update readme, change license file extension
parent 3242c3bc77
commit 3320683d10
@@ -35,6 +35,6 @@ It will traverse through every `*.md` under `src/posts/poetry` and generate the

 ## License

-This project is licensed under the [MIT License](src/branch/main/LICENSE.md). This means that you can do pretty much anything you want with this code as long as you include the original copyright and license notice in your project.
+This project is licensed under the [MIT License](src/branch/main/LICENSE). This means that you can do pretty much anything you want with this code as long as you include the original copyright and license notice in your project.

-Content contained in the `src/posts` folder is under the [CC BY-NC-SA-4.0](src/branch/main/src/posts/LICENSE.md) license. You're free to modify/distribute the posts contained in the `src/posts` folder so long as it's not for commercial purposes, you give attribution, and point out any modifications you've made.
+Content contained in the `src/posts` folder is under the [CC BY-NC-SA-4.0](src/branch/main/src/posts/LICENSE) license. You're free to modify/distribute the posts contained in the `src/posts` folder so long as it's not for commercial purposes, you give attribution, and point out any modifications you've made.
@@ -14,6 +14,7 @@
 		"@threlte/core": "^7.3.0",
 		"@threlte/extras": "^8.11.2",
 		"@threlte/rapier": "^2.0.0",
+		"fuse.js": "^7.0.0",
 		"marked": "^12.0.2",
 		"mdsvex": "^0.11.0",
 		"three": "^0.159.0"
@@ -3427,6 +3428,14 @@
 				"url": "https://github.com/sponsors/ljharb"
 			}
 		},
+		"node_modules/fuse.js": {
+			"version": "7.0.0",
+			"resolved": "https://registry.npmjs.org/fuse.js/-/fuse.js-7.0.0.tgz",
+			"integrity": "sha512-14F4hBIxqKvD4Zz/XjDc3y94mNZN6pRv3U13Udo0lNLCWRBUsrMv2xwcF/y/Z5sV6+FQW+/ow68cHpm4sunt8Q==",
+			"engines": {
+				"node": ">=10"
+			}
+		},
 		"node_modules/gauge": {
 			"version": "3.0.2",
 			"resolved": "https://registry.npmjs.org/gauge/-/gauge-3.0.2.tgz",
@@ -52,6 +52,7 @@
 		"@threlte/core": "^7.3.0",
 		"@threlte/extras": "^8.11.2",
 		"@threlte/rapier": "^2.0.0",
+		"fuse.js": "^7.0.0",
 		"marked": "^12.0.2",
 		"mdsvex": "^0.11.0",
 		"three": "^0.159.0"
@@ -8,32 +8,42 @@ import { marked } from 'marked';

 async function extractTextFromMarkdown(filePath) {
 	const markdown = await fs.readFile(filePath, 'utf8');
-	return marked(markdown).replace(/<[^>]*>/g, ''); // Strip HTML tags generated by marked
+	// remove yaml frontmatter metadata
+	const result = markdown.replace(/---[\s\S]*?---/gm, '');
+	// remove html tags
+	const text = marked(result).replace(/<[^>]*>/g, '');
+	return text;
 }

 async function generateEmbeddingsForDirectory(directoryPath) {
 	// Get all markdown files in directory
-	const files = glob.sync(`${directoryPath}/*.md`);
+	const files = glob.sync(`${directoryPath}/**/*.md`, {
+		ignore: [`${directoryPath}/LICENSE.md`]
+	});

 	// Extract texts from markdown files
 	// eslint-disable-next-line @typescript-eslint/no-unused-vars
-	const poems = await Promise.all(files.map(async (file, _index) => ({
-		id: path.basename(file, '.md'), // Use filename as ID
-		text: await extractTextFromMarkdown(file)
+	const posts = await Promise.all(files.map(async (file, _index) => ({
+		id: path.basename(file, '.md'), // Use filename as ID,
+		text: await extractTextFromMarkdown(file),
+		section: path.basename(path.dirname(file)),
+		filename: path.basename(file)
 	})));

 	// Load the Universal Sentence Encoder model
 	const model = await use.load();
-	const embeddings = await Promise.all(poems.map(poem => model.embed([poem.text])));
+	const embeddings = await Promise.all(posts.map(post => model.embed([post.text])));

-	// Map embeddings back to poem objects
-	const poemEmbeddings = poems.map((poem, index) => ({
-		id: poem.id,
-		vector: embeddings[index].arraySync()[0] // Extract the vector
+	// Map embeddings back to post objects
+	const poemEmbeddings = posts.map((post, index) => ({
+		id: post.id,
+		vector: embeddings[index].arraySync()[0], // Extract the vector
+		section: post.section,
+		filename: post.id
 	}));

 	// Save embeddings to JSON file
 	fs.writeJson('embeddings.json', poemEmbeddings);
 }

-generateEmbeddingsForDirectory('src/posts/poetry'); // Update path accordingly
+generateEmbeddingsForDirectory('src/posts'); // Update path accordingly
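For reference, a sketch of the record shape the updated script should write to `embeddings.json`. The field comments are inferred from the code above; the 512-dimension figure is an assumption based on the Universal Sentence Encoder's usual output, not something stated in the diff. Note one quirk visible in the hunk: the mapping step stores `post.id` in the `filename` field, so the stored filename lacks the `.md` extension even though the post objects carried `path.basename(file)`.

```ts
// Assumed shape of one embeddings.json entry (field names taken from the script above).
type PostEmbedding = {
	id: string; // path.basename(file, '.md'), e.g. 'some-poem'
	vector: number[]; // one row of model.embed(...); typically 512 numbers for USE
	section: string; // parent folder name, e.g. 'poetry' or 'thoughts'
	filename: string; // set to post.id above, so also without the .md extension
};
```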
@@ -2,18 +2,22 @@
 	import type { SearchResult } from '$lib/utils/search';
 	import { searchResults } from '$lib/store';

-	let searchQuery = '';
+	let timer: NodeJS.Timeout | undefined;

-	async function handleSearch() {
-		// const section = window.location.pathname.split('/')[1];
-		const response = await fetch(`/api/poetry/search?q=${encodeURIComponent(searchQuery)}`);
-		if (response.ok) {
-			const data: SearchResult[] = await response.json();
-			searchResults.set(data);
-		} else {
-			console.error('Failed to fetch search results');
-			searchResults.set([]);
-		}
+	async function handleSearch({ target }: Event) {
+		const { value } = target as HTMLInputElement;
+		clearTimeout(timer);
+		timer = setTimeout(async () => {
+			// const section = window.location.pathname.split('/')[1];
+			const response = await fetch(`/api/search?q=${encodeURIComponent(value)}`);
+			if (response.ok) {
+				const data: SearchResult[] = await response.json();
+				searchResults.set(data);
+			} else {
+				console.error('Failed to fetch search results');
+				searchResults.set([]);
+			}
+		}, 300);
 	}
 </script>

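The rewritten handler debounces the endpoint: every keystroke clears the pending timer, so the fetch fires only after 300 ms of inactivity. A minimal generic sketch of the same clearTimeout/setTimeout pattern (the `debounce` helper is illustrative, not part of the commit):

```ts
// Generic debounce: fn runs only once input has been quiet for `ms` milliseconds.
function debounce<A extends unknown[]>(fn: (...args: A) => void, ms: number) {
	let timer: ReturnType<typeof setTimeout> | undefined;
	return (...args: A) => {
		clearTimeout(timer); // cancel the call scheduled by the previous keystroke
		timer = setTimeout(() => fn(...args), ms); // re-arm the timer
	};
}

// Usage roughly equivalent to the component above:
// const search = debounce((q: string) => fetch(`/api/search?q=${encodeURIComponent(q)}`), 300);
```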
@@ -54,20 +58,13 @@
 					type="text"
 					placeholder="Search"
 					class="input w-24 md:w-auto"
-					bind:value={searchQuery}
-					on:input={handleSearch}
+					on:keyup={handleSearch}
 				/>
 			</div>
 		</div>
 		<div class="navbar-end hidden lg:flex">
 			<div class="form-control">
-				<input
-					type="text"
-					placeholder="Search"
-					class="input md:w-auto"
-					bind:value={searchQuery}
-					on:input={handleSearch}
-				/>
+				<input type="text" placeholder="Search" class="input md:w-auto" on:keyup={handleSearch} />
 			</div>
 			<ul class="menu menu-horizontal px-1">
 				<li><a href="/thoughts" class="link-primary">Thoughts</a></li>
@@ -31,8 +31,12 @@
 		{#each results as result}
 			<li class="py-4">
 				<h3 class="pb-1">
-					<a class="link" href={`/poetry/${result.poem.id}`}>{slugToTitle(result.poem.id)}</a>
-					<p class="text-sm">(Relevance: {(result.similarity * 100).toFixed(3)}%)</p>
+					<a class="link" href={`/${result.post.section}/${result.post.filename}`} target="_blank"
+						>{slugToTitle(result.post.id)}</a
+					>
+					<p class="text-sm">
+						(Relevance: {(result.similarity * 100).toFixed(2)}%, Section: {result.post.section})
+					</p>
 				</h3>
 			</li>
 		{/each}
@@ -1,7 +1,6 @@
 export interface Metadata {
 	title: string;
 	date: string;
-	content: string;
 	categories?: string[];
 	draft?: boolean;
 }
@@ -9,18 +8,25 @@
 export interface Section {
 	poetry: 'poetry';
 	thoughts: 'thoughts';
 	projects: 'projects';
+	services: 'services';
+	all: 'all';
 }

 type SectionKey = keyof Section;

 export interface Post {
 	meta: Metadata;
-	path: string;
+	content: string;
+	section: string;
+	filename: string;
+	id: string;
 }

 interface Data {
 	metadata: Metadata;
 	default: {
 		render: () => { html: string };
 	};
 }

 function isData(obj: unknown): obj is Data {
@@ -39,20 +45,19 @@
 export const fetchMarkdownPosts = async (
 	section: SectionKey,
-	limit: number,
-	offset: number
+	limit?: number,
+	offset?: number
 ): Promise<{ posts: Post[]; total: number }> => {
 	let posts: Record<string, () => Promise<unknown>>;
 	switch (section) {
+		case 'all':
+			posts = import.meta.glob('/src/posts/**/*.md');
+			break;
 		case 'poetry':
 			posts = import.meta.glob('/src/posts/poetry/*.md');
 			break;
 		case 'projects':
 			posts = import.meta.glob('/src/routes/(app)/projects/posts/*.md');
 			break;
 		case 'thoughts':
 			posts = import.meta.glob('/src/posts/thoughts/*.md');
-			console.log(posts);
 			break;
 		default:
 			throw new Error('Could not find this section');
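`import.meta.glob` (a Vite API) returns a record mapping each matched path to a lazy import function; the hunk below consumes it roughly as in this sketch (variable names are illustrative, mirroring the `isData` guard already defined in this file):

```ts
// Sketch: resolve every lazily-imported markdown module from the glob record.
const allPosts = await Promise.all(
	Object.entries(posts).map(async ([path, resolver]) => {
		const data = await resolver(); // dynamic import of one .md module
		if (isData(data)) {
			// metadata, rendered html, etc. are extracted here (see the next hunk)
			return { path, data };
		}
		return undefined;
	})
);
```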
@@ -70,10 +75,18 @@
 				return undefined;
 			}
 			const { metadata } = data;
-			const postPath = path.slice(11, -3);
+			const { html } = data.default.render();
+			// remove html tags
+			const content = html.replace(/<[^>]*>/g, '');
+			const section = path.split('/')[3];
+			const filename = path.split('/').pop()?.slice(0, -3);
+
 			return {
 				meta: { ...metadata },
-				path: postPath
+				content,
+				section,
+				filename,
+				id: data.metadata.title
 			};
 		} else {
 			console.error('Could not properly parse this post');
@@ -94,6 +107,10 @@
 		(b, a) => new Date(a?.meta.date || '').getTime() - new Date(b?.meta.date || '').getTime()
 	);

+	if (limit === undefined || offset === undefined) {
+		return { posts: sortedPosts, total: allPosts.length };
+	}
+
 	const paginatedPosts = sortedPosts.slice(offset, offset + limit);

 	return { posts: paginatedPosts, total: allPosts.length };
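With `limit` and `offset` now optional, the same helper serves both the paginated listing pages and callers that need the full set, such as the new search endpoint. A usage sketch (argument values are illustrative):

```ts
// Paginated: ten poetry posts, skipping the first ten (page 2).
const page = await fetchMarkdownPosts('poetry', 10, 10);

// Unpaginated: every post across sections, as the search endpoint does.
const { posts, total } = await fetchMarkdownPosts('all');
```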
File diff suppressed because one or more lines are too long
@@ -8,10 +8,12 @@ import * as tf from '@tensorflow/tfjs-node';
 export type Embedding = {
 	id: string;
 	vector: number[];
+	section: string;
+	filename: string;
 };

 export type SearchResult = {
-	poem: Embedding;
+	post: Embedding;
 	similarity: number;
 };

@@ -1,40 +0,0 @@
-// eslint-disable-next-line
-import * as tf from '@tensorflow/tfjs-node';
-import poemEmbeddings from '$lib/utils/poetry/embeddings.json';
-import { json } from '@sveltejs/kit';
-import { getModel, type Embedding, type SearchResult } from '$lib/utils/search';
-
-// Search handler
-export const GET = async ({ url }: { url: URL }) => {
-	const model = await getModel();
-	const searchQuery = url.searchParams.get('q');
-	if (!searchQuery) {
-		return { status: 400, body: { error: 'Query parameter "q" is required' } };
-	}
-
-	try {
-		// Generate embedding for the query
-		const queryEmbedding = await model.embed([searchQuery]);
-		const queryVec = queryEmbedding.arraySync()[0];
-
-		// Calculate similarities
-		const results = poemEmbeddings
-			.map((poem: Embedding) => ({
-				poem,
-				similarity: cosineSimilarity(queryVec, poem.vector)
-			}))
-			.sort((a: SearchResult, b: SearchResult) => b.similarity - a.similarity)
-			.slice(0, 10); // Top 10 results
-
-		return json(results);
-	} catch (error) {
-		return { status: 500, body: { error: (error as Error).message } };
-	}
-};
-
-function cosineSimilarity(vecA: number[], vecB: number[]) {
-	const dotProduct = vecA.reduce((acc, val, i) => acc + val * vecB[i], 0);
-	const magnitudeA = Math.sqrt(vecA.reduce((acc, val) => acc + val * val, 0));
-	const magnitudeB = Math.sqrt(vecB.reduce((acc, val) => acc + val * val, 0));
-	return dotProduct / (magnitudeA * magnitudeB);
-}
@@ -0,0 +1,88 @@
+// eslint-disable-next-line
+import * as tf from '@tensorflow/tfjs-node';
+import postEmbeddings from '$lib/utils/poetry/embeddings.json';
+import { json } from '@sveltejs/kit';
+import { getModel, type Embedding, type SearchResult } from '$lib/utils/search';
+import { fetchMarkdownPosts } from '$lib/utils';
+import Fuse from 'fuse.js';
+
+// Search handler
+export const GET = async ({ url }: { url: URL }) => {
+	const searchQuery = url.searchParams.get('q');
+	if (!searchQuery) {
+		return { status: 400, body: { error: 'Query parameter "q" is required' } };
+	}
+
+	try {
+		const model = await getModel();
+		const { posts } = await fetchMarkdownPosts('all');
+		const fuse = new Fuse(posts, {
+			keys: ['content', 'meta.title', 'meta.tags'],
+			includeScore: true
+		});
+
+		// Fuzzy search
+		const fuzzyResults = fuse.search(searchQuery, { limit: 10 });
+
+		// Generate embedding for the query
+		const queryEmbedding = await model.embed([searchQuery]);
+		const queryVec = queryEmbedding.arraySync()[0];
+
+		// Calculate similarities
+		let semanticResults = postEmbeddings.map((post: Embedding) => ({
+			post,
+			similarity: cosineSimilarity(queryVec, post.vector)
+		}));
+
+		// add fuzzy results to semantic results
+		semanticResults = semanticResults.map((semanticResult) => {
+			const fuzzyResultIndex = fuzzyResults.findIndex(
+				(fuzzyResult) =>
+					fuzzyResult.item.section === semanticResult.post.section &&
+					fuzzyResult.item.filename === semanticResult.post.filename
+			);
+
+			if (fuzzyResultIndex > -1) {
+				const fuzzyResult = fuzzyResults.splice(fuzzyResultIndex, 1)[0];
+
+				if (fuzzyResult.score && 1 - fuzzyResult.score > 0.8) {
+					semanticResult.similarity = 1 - fuzzyResult.score / 2 + semanticResult.similarity;
+				}
+			}
+			return semanticResult;
+		});
+
+		// add rest of fuzzy results
+		semanticResults.push(
+			...fuzzyResults.map((fuzzyResult) => {
+				let similarity = 0;
+				if (fuzzyResult.score && 1 - fuzzyResult.score > 0.9) {
+					similarity = 1 - fuzzyResult.score / 2;
+				}
+				return {
+					post: {
+						id: fuzzyResult.item.id,
+						section: fuzzyResult.item.section,
+						filename: fuzzyResult.item.filename,
+						vector: [0, 0, 0]
+					},
+					similarity: similarity
+				};
+			})
+		);
+		semanticResults = semanticResults
+			.sort((a: SearchResult, b: SearchResult) => b.similarity - a.similarity)
+			.slice(0, 10);
+
+		return json(semanticResults);
+	} catch (error) {
+		return { status: 500, body: { error: (error as Error).message } };
+	}
+};
+
+function cosineSimilarity(vecA: number[], vecB: number[]) {
+	const dotProduct = vecA.reduce((acc, val, i) => acc + val * vecB[i], 0);
+	const magnitudeA = Math.sqrt(vecA.reduce((acc, val) => acc + val * val, 0));
+	const magnitudeB = Math.sqrt(vecB.reduce((acc, val) => acc + val * val, 0));
+	return dotProduct / (magnitudeA * magnitudeB);
+}
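To make the merging rule concrete: Fuse.js scores are distance-like (0 is a perfect match), so `1 - score` behaves as a similarity. A worked pass through the branches above, with illustrative numbers:

```ts
// A post matched both ways: fuzzy score 0.1, semantic similarity 0.4.
const score = 0.1;
const fuzzySimilarity = 1 - score; // 0.9 > 0.8, so the boost branch applies
const combined = 1 - score / 2 + 0.4; // 0.95 + 0.4 = 1.35 — strong fuzzy hits dominate

// A fuzzy-only hit (no embedding) needs 1 - score > 0.9 to score at all,
// and gets a placeholder vector of [0, 0, 0] before the final sort and top-10 cut.
```

Note that the combined value can exceed 1, so the relevance percentage rendered by the results component can read above 100%.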