playground/scripts/generate-embeddings.js

// eslint-disable-next-line @typescript-eslint/no-unused-vars
import * as tf from '@tensorflow/tfjs-node';
import use from '@tensorflow-models/universal-sentence-encoder';
import fs from 'fs-extra';
import glob from 'glob';
import path from 'path';
import { marked } from 'marked';

async function extractTextFromMarkdown(filePath) {
    const markdown = await fs.readFile(filePath, 'utf8');
		// remove yaml frontmatter metadata
		const result = markdown.replace(/---[\s\S]*?---/gm, '');
		// remove html tags
		const text = marked(result).replace(/<[^>]*>/g, '');
		return text;
}

async function generateEmbeddingsForDirectory(directoryPath) {
    // Get all markdown files in directory
    const files = glob.sync(`${directoryPath}/**/*.md`, {
			ignore: [`${directoryPath}/LICENSE.md`]
		});

    // Extract texts from markdown files
    // eslint-disable-next-line @typescript-eslint/no-unused-vars
    const posts = await Promise.all(files.map(async (file, _index) => ({
        id: path.basename(file, '.md'),  // Use filename as ID,
        text: await extractTextFromMarkdown(file),
				section: path.basename(path.dirname(file)),
				filename: path.basename(file)
    })));

    // Load the Universal Sentence Encoder model
    const model = await use.load();
    const embeddings = await Promise.all(posts.map(post => model.embed([post.text])));

    // Map embeddings back to post objects
    const poemEmbeddings = posts.map((post, index) => ({
        id: post.id,
        vector: embeddings[index].arraySync()[0],  // Extract the vector
				section: post.section,
				filename: post.id
    }));

    // Save embeddings to JSON file
    fs.writeJson('embeddings.json', poemEmbeddings);
}

generateEmbeddingsForDirectory('src/posts');  // Update path accordingly