add semantic search

2024-05-31 01:31:37 -04:00
parent 53635f0d59
commit de9cccabda
19 changed files with 1398 additions and 105 deletions
--- a/scripts/generate-embeddings.js
+++ b/scripts/generate-embeddings.js
@@ -0,0 +1,39 @@
+// eslint-disable-next-line @typescript-eslint/no-unused-vars
+import * as tf from '@tensorflow/tfjs-node';
+import use from '@tensorflow-models/universal-sentence-encoder';
+import fs from 'fs-extra';
+import glob from 'glob';
+import path from 'path';
+import { marked } from 'marked';
+
+async function extractTextFromMarkdown(filePath) {
+    const markdown = await fs.readFile(filePath, 'utf8');
+    return marked(markdown).replace(/<[^>]*>/g, ''); // Strip HTML tags generated by marked
+}
+
+async function generateEmbeddingsForDirectory(directoryPath) {
+    // Get all markdown files in directory
+    const files = glob.sync(`${directoryPath}/*.md`);
+
+    // Extract texts from markdown files
+    // eslint-disable-next-line @typescript-eslint/no-unused-vars
+    const poems = await Promise.all(files.map(async (file, _index) => ({
+        id: path.basename(file, '.md'),  // Use filename as ID
+        text: await extractTextFromMarkdown(file)
+    })));
+
+    // Load the Universal Sentence Encoder model
+    const model = await use.load();
+    const embeddings = await Promise.all(poems.map(poem => model.embed([poem.text])));
+
+    // Map embeddings back to poem objects
+    const poemEmbeddings = poems.map((poem, index) => ({
+        id: poem.id,
+        vector: embeddings[index].arraySync()[0]  // Extract the vector
+    }));
+
+    // Save embeddings to JSON file
+    fs.writeJson('embeddings.json', poemEmbeddings);
+}
+
+generateEmbeddingsForDirectory('src/posts/poetry');  // Update path accordingly