add fuzzy search and combine it with semantic search results, update readme, change license file extension
This commit is contained in:
@@ -8,32 +8,42 @@ import { marked } from 'marked';
|
||||
|
||||
/**
 * Read a Markdown file and reduce it to plain text.
 *
 * A YAML front matter block is removed only when it opens the file; `---`
 * sequences further down (thematic breaks, dashes in prose) are left alone.
 * The remainder is rendered with `marked` and the resulting HTML tags are
 * stripped.
 *
 * @param {string} filePath - Path of the Markdown file to read.
 * @returns {Promise<string>} The rendered text with HTML tags removed.
 */
async function extractTextFromMarkdown(filePath) {
  const markdown = await fs.readFile(filePath, 'utf8');

  // Anchored to the start of the input and deliberately without the `g`/`m`
  // flags: an unanchored, global /---[\s\S]*?---/ would also delete any body
  // text that happens to sit between two later `---` sequences.
  // Tolerates a leading BOM, CRLF line endings and an empty front matter block.
  const body = markdown.replace(
    /^\uFEFF?---[ \t]*\r?\n(?:[\s\S]*?\r?\n)?---[ \t]*(?:\r?\n|$)/,
    '',
  );

  // Render to HTML, then drop every tag so only the text content remains.
  return marked(body).replace(/<[^>]*>/g, '');
}
|
||||
|
||||
/**
 * Build an embedding for every Markdown post under `directoryPath` and write
 * the collection to `embeddings.json` in the current working directory.
 *
 * Each record holds the post id (file name without `.md`), its embedding
 * vector, the name of the directory that contains the file (`section`) and a
 * `filename` field.
 *
 * @param {string} directoryPath - Root directory searched recursively for
 *   `*.md` files; `LICENSE.md` directly inside it is skipped.
 * @returns {Promise<void>} Resolves once `embeddings.json` has been written.
 */
async function generateEmbeddingsForDirectory(directoryPath) {
  // Recursive match, so posts in every sub-directory (section) are included.
  const files = glob.sync(`${directoryPath}/**/*.md`, {
    ignore: [`${directoryPath}/LICENSE.md`],
  });

  // Extract the plain text of each post.
  // NOTE(review): ids are bare file names, so two posts with the same name in
  // different sections end up with the same id.
  const posts = await Promise.all(
    files.map(async (file) => ({
      id: path.basename(file, '.md'),
      text: await extractTextFromMarkdown(file),
      section: path.basename(path.dirname(file)),
    })),
  );

  // Load the Universal Sentence Encoder model and embed each post's text.
  const model = await use.load();
  const embeddings = await Promise.all(
    posts.map((post) => model.embed([post.text])),
  );

  // Pair every post with its vector; one text was embedded per call, hence
  // the first (only) row of the result is taken.
  const postEmbeddings = posts.map((post, index) => ({
    id: post.id,
    vector: embeddings[index].arraySync()[0],
    section: post.section,
    // NOTE(review): this is the extension-less id, not the on-disk file name
    // (`<id>.md`). Kept unchanged so existing readers of embeddings.json keep
    // working; confirm which value they actually expect.
    filename: post.id,
  }));

  // Awaited so the returned promise settles only after the file is written
  // and a write failure rejects it instead of going unhandled.
  await fs.writeJson('embeddings.json', postEmbeddings);
}
|
||||
|
||||
// Script entry point: embed every post under src/posts.
// The rejection handler reports the failure and marks the run as failed
// instead of leaving the promise floating.
generateEmbeddingsForDirectory('src/posts') // Update path accordingly
  .catch((error) => {
    console.error('Failed to generate embeddings:', error);
    process.exitCode = 1;
  });
|
||||
|
Reference in New Issue
Block a user