// playground/scripts/generate-embeddings.js

// eslint-disable-next-line @typescript-eslint/no-unused-vars
import * as tf from '@tensorflow/tfjs-node';
import use from '@tensorflow-models/universal-sentence-encoder';
import fs from 'fs-extra';
import glob from 'glob';
import path from 'path';
import { marked } from 'marked';

/**
 * Replace the character entities produced by HTML-escaping text
 * (`&amp;`, `&lt;`, `&gt;`, `&quot;`, `&#39;`) with the characters they stand for.
 *
 * A single regex pass is used so that an already-decoded `&` can never combine
 * with following text and be decoded a second time (e.g. `&amp;lt;` -> `&lt;`).
 *
 * @param {string} text - Text that may contain escaped entities.
 * @returns {string} The text with those five entities decoded.
 */
function decodeBasicHtmlEntities(text) {
    const entityToChar = {
        amp: '&',
        lt: '<',
        gt: '>',
        quot: '"',
        '#39': "'",
    };
    return text.replace(/&(amp|lt|gt|quot|#39);/g, (_match, name) => entityToChar[name]);
}

/**
 * Read a markdown file and return its content as plain text.
 *
 * The markdown is rendered to HTML with `marked`, the HTML tags are stripped,
 * and escaped entities are decoded so that, for example, an apostrophe comes
 * back as `'` rather than `&#39;`.
 *
 * @param {string} filePath - Path of the markdown file to read (as UTF-8).
 * @returns {Promise<string>} The plain-text content of the file.
 */
async function extractTextFromMarkdown(filePath) {
    const markdown = await fs.readFile(filePath, 'utf8');
    // Awaiting is a no-op for a string result and also covers a promise result
    // (marked can be configured to render asynchronously).
    const html = await marked(markdown);
    const withoutTags = html.replace(/<[^>]*>/g, ''); // Strip HTML tags generated by marked
    return decodeBasicHtmlEntities(withoutTags);
}

/**
 * Generate Universal Sentence Encoder embeddings for every `*.md` file directly
 * inside a directory and save them as a JSON array of `{ id, vector }` objects.
 *
 * Each entry's `id` is the file name without its `.md` extension and `vector`
 * is the embedding of the file's plain-text content.
 *
 * @param {string} directoryPath - Directory containing the markdown files.
 * @param {string} [outputPath='embeddings.json'] - Where to write the JSON result.
 * @returns {Promise<void>} Resolves once the JSON file has been written.
 */
async function generateEmbeddingsForDirectory(directoryPath, outputPath = 'embeddings.json') {
    // Get all markdown files in directory
    const files = glob.sync(`${directoryPath}/*.md`);

    // Extract texts from markdown files
    const poems = await Promise.all(files.map(async (file) => ({
        id: path.basename(file, '.md'),  // Use filename as ID
        text: await extractTextFromMarkdown(file),
    })));

    // Load the Universal Sentence Encoder model
    const model = await use.load();

    // Embed one poem at a time so that only a single result tensor is alive at
    // any moment, and release it as soon as its values have been copied out.
    const poemEmbeddings = [];
    for (const poem of poems) {
        const tensor = await model.embed([poem.text]);
        try {
            const [vector] = await tensor.array();  // One input text -> one row
            poemEmbeddings.push({ id: poem.id, vector });
        } finally {
            tensor.dispose();
        }
    }

    // Save embeddings to JSON file; awaited so write failures reach the caller
    // and the returned promise only resolves after the file exists.
    await fs.writeJson(outputPath, poemEmbeddings);
}

// Script entry point. Update path accordingly.
// The rejection handler reports any failure and marks the process as failed
// instead of leaving the promise floating.
generateEmbeddingsForDirectory('src/posts/poetry').catch((error) => {
    console.error('Failed to generate embeddings:', error);
    process.exitCode = 1;
});