add fuzzy search and combine it with semantic search results, update README, change license file extension
parent 3242c3bc77
commit 3320683d10
@@ -35,6 +35,6 @@ It will traverse through every `*.md` under `src/posts/poetry` and generate the
 
 ## License
 
-This project is licensed under the [MIT License](src/branch/main/LICENSE.md). This means that you can do pretty much anything you want with this code as long as you include the original copyright and license notice in your project.
+This project is licensed under the [MIT License](src/branch/main/LICENSE). This means that you can do pretty much anything you want with this code as long as you include the original copyright and license notice in your project.
 
-Content contained in the `src/posts` folder is under the [CC BY-NC-SA-4.0](src/branch/main/src/posts/LICENSE.md) license. You're free to modify/distribute the posts contained in the `src/posts` folder so long as it's not for commercial purposes, you give attribution, and point out any modifications you've made.
+Content contained in the `src/posts` folder is under the [CC BY-NC-SA-4.0](src/branch/main/src/posts/LICENSE) license. You're free to modify/distribute the posts contained in the `src/posts` folder so long as it's not for commercial purposes, you give attribution, and point out any modifications you've made.
@@ -14,6 +14,7 @@
     "@threlte/core": "^7.3.0",
     "@threlte/extras": "^8.11.2",
     "@threlte/rapier": "^2.0.0",
+    "fuse.js": "^7.0.0",
     "marked": "^12.0.2",
     "mdsvex": "^0.11.0",
     "three": "^0.159.0"
@@ -3427,6 +3428,14 @@
         "url": "https://github.com/sponsors/ljharb"
       }
     },
+    "node_modules/fuse.js": {
+      "version": "7.0.0",
+      "resolved": "https://registry.npmjs.org/fuse.js/-/fuse.js-7.0.0.tgz",
+      "integrity": "sha512-14F4hBIxqKvD4Zz/XjDc3y94mNZN6pRv3U13Udo0lNLCWRBUsrMv2xwcF/y/Z5sV6+FQW+/ow68cHpm4sunt8Q==",
+      "engines": {
+        "node": ">=10"
+      }
+    },
     "node_modules/gauge": {
       "version": "3.0.2",
       "resolved": "https://registry.npmjs.org/gauge/-/gauge-3.0.2.tgz",
@@ -52,6 +52,7 @@
         "@threlte/core": "^7.3.0",
         "@threlte/extras": "^8.11.2",
         "@threlte/rapier": "^2.0.0",
+        "fuse.js": "^7.0.0",
         "marked": "^12.0.2",
         "mdsvex": "^0.11.0",
         "three": "^0.159.0"
@@ -8,32 +8,42 @@ import { marked } from 'marked';
 
 async function extractTextFromMarkdown(filePath) {
   const markdown = await fs.readFile(filePath, 'utf8');
-  return marked(markdown).replace(/<[^>]*>/g, ''); // Strip HTML tags generated by marked
+  // remove yaml frontmatter metadata
+  const result = markdown.replace(/---[\s\S]*?---/gm, '');
+  // remove html tags
+  const text = marked(result).replace(/<[^>]*>/g, '');
+  return text;
 }
 
 async function generateEmbeddingsForDirectory(directoryPath) {
   // Get all markdown files in directory
-  const files = glob.sync(`${directoryPath}/*.md`);
+  const files = glob.sync(`${directoryPath}/**/*.md`, {
+    ignore: [`${directoryPath}/LICENSE.md`]
+  });
 
   // Extract texts from markdown files
   // eslint-disable-next-line @typescript-eslint/no-unused-vars
-  const poems = await Promise.all(files.map(async (file, _index) => ({
-    id: path.basename(file, '.md'), // Use filename as ID
-    text: await extractTextFromMarkdown(file)
+  const posts = await Promise.all(files.map(async (file, _index) => ({
+    id: path.basename(file, '.md'), // Use filename as ID,
+    text: await extractTextFromMarkdown(file),
+    section: path.basename(path.dirname(file)),
+    filename: path.basename(file)
   })));
 
   // Load the Universal Sentence Encoder model
   const model = await use.load();
-  const embeddings = await Promise.all(poems.map(poem => model.embed([poem.text])));
+  const embeddings = await Promise.all(posts.map(post => model.embed([post.text])));
 
-  // Map embeddings back to poem objects
-  const poemEmbeddings = poems.map((poem, index) => ({
-    id: poem.id,
-    vector: embeddings[index].arraySync()[0] // Extract the vector
+  // Map embeddings back to post objects
+  const poemEmbeddings = posts.map((post, index) => ({
+    id: post.id,
+    vector: embeddings[index].arraySync()[0], // Extract the vector
+    section: post.section,
+    filename: post.id
   }));
 
   // Save embeddings to JSON file
   fs.writeJson('embeddings.json', poemEmbeddings);
 }
 
-generateEmbeddingsForDirectory('src/posts/poetry'); // Update path accordingly
+generateEmbeddingsForDirectory('src/posts'); // Update path accordingly
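Note: the frontmatter strip in `extractTextFromMarkdown` relies on a non-greedy regex; because of the `g` flag, any later `---`…`---` pair in the body (e.g. two horizontal rules) would also be removed. A minimal sketch of the intended behaviour, using a hypothetical sample post:

// Sketch only: demonstrates the frontmatter regex used above.
const sample = '---\ntitle: Example\ndate: 2024-01-01\n---\n\nBody text';
const stripped = sample.replace(/---[\s\S]*?---/gm, '');
console.log(stripped.trim()); // -> 'Body text'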
@@ -2,18 +2,22 @@
   import type { SearchResult } from '$lib/utils/search';
   import { searchResults } from '$lib/store';
 
-  let searchQuery = '';
+  let timer: NodeJS.Timeout | undefined;
 
-  async function handleSearch() {
-    // const section = window.location.pathname.split('/')[1];
-    const response = await fetch(`/api/poetry/search?q=${encodeURIComponent(searchQuery)}`);
-    if (response.ok) {
-      const data: SearchResult[] = await response.json();
-      searchResults.set(data);
-    } else {
-      console.error('Failed to fetch search results');
-      searchResults.set([]);
-    }
+  async function handleSearch({ target }: Event) {
+    const { value } = target as HTMLInputElement;
+    clearTimeout(timer);
+    timer = setTimeout(async () => {
+      // const section = window.location.pathname.split('/')[1];
+      const response = await fetch(`/api/search?q=${encodeURIComponent(value)}`);
+      if (response.ok) {
+        const data: SearchResult[] = await response.json();
+        searchResults.set(data);
+      } else {
+        console.error('Failed to fetch search results');
+        searchResults.set([]);
+      }
+    }, 300);
   }
 </script>
 
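Note: the rewritten handler is a plain clearTimeout/setTimeout debounce, so a request only fires once typing pauses for 300 ms. The same pattern as a standalone, hypothetical helper (not part of this commit):

// Generic debounce: postpone `fn` until `delay` ms pass without a new call.
function debounce<A extends unknown[]>(fn: (...args: A) => void, delay = 300) {
  let timer: ReturnType<typeof setTimeout> | undefined;
  return (...args: A) => {
    clearTimeout(timer); // cancel the pending call, if any
    timer = setTimeout(() => fn(...args), delay);
  };
}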
@@ -54,20 +58,13 @@
           type="text"
           placeholder="Search"
           class="input w-24 md:w-auto"
-          bind:value={searchQuery}
-          on:input={handleSearch}
+          on:keyup={handleSearch}
         />
       </div>
     </div>
     <div class="navbar-end hidden lg:flex">
       <div class="form-control">
-        <input
-          type="text"
-          placeholder="Search"
-          class="input md:w-auto"
-          bind:value={searchQuery}
-          on:input={handleSearch}
-        />
+        <input type="text" placeholder="Search" class="input md:w-auto" on:keyup={handleSearch} />
       </div>
       <ul class="menu menu-horizontal px-1">
         <li><a href="/thoughts" class="link-primary">Thoughts</a></li>
@@ -31,8 +31,12 @@
       {#each results as result}
         <li class="py-4">
           <h3 class="pb-1">
-            <a class="link" href={`/poetry/${result.poem.id}`}>{slugToTitle(result.poem.id)}</a>
-            <p class="text-sm">(Relevance: {(result.similarity * 100).toFixed(3)}%)</p>
+            <a class="link" href={`/${result.post.section}/${result.post.filename}`} target="_blank"
+              >{slugToTitle(result.post.id)}</a
+            >
+            <p class="text-sm">
+              (Relevance: {(result.similarity * 100).toFixed(2)}%, Section: {result.post.section})
+            </p>
           </h3>
         </li>
       {/each}
@@ -1,7 +1,6 @@
 export interface Metadata {
   title: string;
   date: string;
-  content: string;
   categories?: string[];
   draft?: boolean;
 }
@@ -9,18 +8,25 @@ export interface Metadata {
 export interface Section {
   poetry: 'poetry';
   thoughts: 'thoughts';
-  projects: 'projects';
+  services: 'services';
+  all: 'all';
 }
 
 type SectionKey = keyof Section;
 
 export interface Post {
   meta: Metadata;
-  path: string;
+  content: string;
+  section: string;
+  filename: string;
+  id: string;
 }
 
 interface Data {
   metadata: Metadata;
+  default: {
+    render: () => { html: string };
+  };
 }
 
 function isData(obj: unknown): obj is Data {
@@ -39,20 +45,19 @@ function isData(obj: unknown): obj is Data {
 
 export const fetchMarkdownPosts = async (
   section: SectionKey,
-  limit: number,
-  offset: number
+  limit?: number,
+  offset?: number
 ): Promise<{ posts: Post[]; total: number }> => {
   let posts: Record<string, () => Promise<unknown>>;
   switch (section) {
+    case 'all':
+      posts = import.meta.glob('/src/posts/**/*.md');
+      break;
     case 'poetry':
       posts = import.meta.glob('/src/posts/poetry/*.md');
       break;
-    case 'projects':
-      posts = import.meta.glob('/src/routes/(app)/projects/posts/*.md');
-      break;
     case 'thoughts':
       posts = import.meta.glob('/src/posts/thoughts/*.md');
-      console.log(posts);
       break;
     default:
       throw new Error('Could not find this section');
@@ -70,10 +75,18 @@ export const fetchMarkdownPosts = async (
         return undefined;
       }
       const { metadata } = data;
-      const postPath = path.slice(11, -3);
+      const { html } = data.default.render();
+      // remove html tags
+      const content = html.replace(/<[^>]*>/g, '');
+      const section = path.split('/')[3];
+      const filename = path.split('/').pop()?.slice(0, -3);
+
       return {
         meta: { ...metadata },
-        path: postPath
+        content,
+        section,
+        filename,
+        id: data.metadata.title
       };
     } else {
       console.error('Could not properly parse this post');
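Note: `path.split('/')[3]` works because the `import.meta.glob` keys are project-root paths like `/src/posts/<section>/<file>.md`, and splitting on '/' yields a leading empty string. A worked example with a hypothetical path:

// '/src/posts/poetry/some-poem.md'.split('/')
// -> ['', 'src', 'posts', 'poetry', 'some-poem.md']
const p = '/src/posts/poetry/some-poem.md';
const section = p.split('/')[3]; // 'poetry'
const filename = p.split('/').pop()?.slice(0, -3); // 'some-poem' (drops '.md')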
@@ -94,6 +107,10 @@ export const fetchMarkdownPosts = async (
     (b, a) => new Date(a?.meta.date || '').getTime() - new Date(b?.meta.date || '').getTime()
   );
 
+  if (limit === undefined || offset === undefined) {
+    return { posts: sortedPosts, total: allPosts.length };
+  }
+
   const paginatedPosts = sortedPosts.slice(offset, offset + limit);
 
   return { posts: paginatedPosts, total: allPosts.length };
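Note: with `limit` and `offset` now optional, callers can fetch a whole section or a page. Hypothetical call sites, assuming the signature above:

// Everything in one go (as the search endpoint below does):
const { posts, total } = await fetchMarkdownPosts('all');
// First page of ten poetry posts:
const page = await fetchMarkdownPosts('poetry', 10, 0);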
File diff suppressed because one or more lines are too long
@@ -8,10 +8,12 @@ import * as tf from '@tensorflow/tfjs-node';
 export type Embedding = {
   id: string;
   vector: number[];
+  section: string;
+  filename: string;
 };
 
 export type SearchResult = {
-  poem: Embedding;
+  post: Embedding;
   similarity: number;
 };
 
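Note: each entry the generation script writes to embeddings.json should match this shape (example values hypothetical; the Universal Sentence Encoder produces 512-dimensional vectors):

// Hypothetical embeddings.json entry, typed with the Embedding above:
const entry: Embedding = {
  id: 'some-poem',
  vector: [0.0123, -0.0456 /* …512 dims from USE */],
  section: 'poetry',
  filename: 'some-poem'
};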
@@ -1,40 +0,0 @@
-// eslint-disable-next-line
-import * as tf from '@tensorflow/tfjs-node';
-import poemEmbeddings from '$lib/utils/poetry/embeddings.json';
-import { json } from '@sveltejs/kit';
-import { getModel, type Embedding, type SearchResult } from '$lib/utils/search';
-
-// Search handler
-export const GET = async ({ url }: { url: URL }) => {
-  const model = await getModel();
-  const searchQuery = url.searchParams.get('q');
-  if (!searchQuery) {
-    return { status: 400, body: { error: 'Query parameter "q" is required' } };
-  }
-
-  try {
-    // Generate embedding for the query
-    const queryEmbedding = await model.embed([searchQuery]);
-    const queryVec = queryEmbedding.arraySync()[0];
-
-    // Calculate similarities
-    const results = poemEmbeddings
-      .map((poem: Embedding) => ({
-        poem,
-        similarity: cosineSimilarity(queryVec, poem.vector)
-      }))
-      .sort((a: SearchResult, b: SearchResult) => b.similarity - a.similarity)
-      .slice(0, 10); // Top 10 results
-
-    return json(results);
-  } catch (error) {
-    return { status: 500, body: { error: (error as Error).message } };
-  }
-};
-
-function cosineSimilarity(vecA: number[], vecB: number[]) {
-  const dotProduct = vecA.reduce((acc, val, i) => acc + val * vecB[i], 0);
-  const magnitudeA = Math.sqrt(vecA.reduce((acc, val) => acc + val * val, 0));
-  const magnitudeB = Math.sqrt(vecB.reduce((acc, val) => acc + val * val, 0));
-  return dotProduct / (magnitudeA * magnitudeB);
-}
@@ -0,0 +1,88 @@
+// eslint-disable-next-line
+import * as tf from '@tensorflow/tfjs-node';
+import postEmbeddings from '$lib/utils/poetry/embeddings.json';
+import { json } from '@sveltejs/kit';
+import { getModel, type Embedding, type SearchResult } from '$lib/utils/search';
+import { fetchMarkdownPosts } from '$lib/utils';
+import Fuse from 'fuse.js';
+
+// Search handler
+export const GET = async ({ url }: { url: URL }) => {
+  const searchQuery = url.searchParams.get('q');
+  if (!searchQuery) {
+    return { status: 400, body: { error: 'Query parameter "q" is required' } };
+  }
+
+  try {
+    const model = await getModel();
+    const { posts } = await fetchMarkdownPosts('all');
+    const fuse = new Fuse(posts, {
+      keys: ['content', 'meta.title', 'meta.tags'],
+      includeScore: true
+    });
+
+    // Fuzzy search
+    const fuzzyResults = fuse.search(searchQuery, { limit: 10 });
+
+    // Generate embedding for the query
+    const queryEmbedding = await model.embed([searchQuery]);
+    const queryVec = queryEmbedding.arraySync()[0];
+
+    // Calculate similarities
+    let semanticResults = postEmbeddings.map((post: Embedding) => ({
+      post,
+      similarity: cosineSimilarity(queryVec, post.vector)
+    }));
+
+    // add fuzzy results to semantic results
+    semanticResults = semanticResults.map((semanticResult) => {
+      const fuzzyResultIndex = fuzzyResults.findIndex(
+        (fuzzyResult) =>
+          fuzzyResult.item.section === semanticResult.post.section &&
+          fuzzyResult.item.filename === semanticResult.post.filename
+      );
+
+      if (fuzzyResultIndex > -1) {
+        const fuzzyResult = fuzzyResults.splice(fuzzyResultIndex, 1)[0];
+
+        if (fuzzyResult.score && 1 - fuzzyResult.score > 0.8) {
+          semanticResult.similarity = 1 - fuzzyResult.score / 2 + semanticResult.similarity;
+        }
+      }
+      return semanticResult;
+    });
+
+    // add rest of fuzzy results
+    semanticResults.push(
+      ...fuzzyResults.map((fuzzyResult) => {
+        let similarity = 0;
+        if (fuzzyResult.score && 1 - fuzzyResult.score > 0.9) {
+          similarity = 1 - fuzzyResult.score / 2;
+        }
+        return {
+          post: {
+            id: fuzzyResult.item.id,
+            section: fuzzyResult.item.section,
+            filename: fuzzyResult.item.filename,
+            vector: [0, 0, 0]
+          },
+          similarity: similarity
+        };
+      })
+    );
+    semanticResults = semanticResults
+      .sort((a: SearchResult, b: SearchResult) => b.similarity - a.similarity)
+      .slice(0, 10);
+
+    return json(semanticResults);
+  } catch (error) {
+    return { status: 500, body: { error: (error as Error).message } };
+  }
+};
+
+function cosineSimilarity(vecA: number[], vecB: number[]) {
+  const dotProduct = vecA.reduce((acc, val, i) => acc + val * vecB[i], 0);
+  const magnitudeA = Math.sqrt(vecA.reduce((acc, val) => acc + val * val, 0));
+  const magnitudeB = Math.sqrt(vecB.reduce((acc, val) => acc + val * val, 0));
+  return dotProduct / (magnitudeA * magnitudeB);
+}
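Note: Fuse.js scores run from 0 (perfect match) to 1 (no match), so `1 - score` acts as a similarity; the endpoint above adds `1 - score / 2` onto a post's cosine similarity when the fuzzy match is strong. A worked example of that blending rule with hypothetical numbers:

// score = 0.1 -> 1 - 0.1 = 0.9 > 0.8, so the boost applies.
const score = 0.1;   // strong fuzzy hit for a post
const cosine = 0.42; // semantic similarity for the same post
const combined = 1 - score > 0.8 ? 1 - score / 2 + cosine : cosine;
console.log(combined); // 0.95 + 0.42 = 1.37 (may exceed 1 by design)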