playground/scripts/generate-embeddings.js

50 lines
1.7 KiB
JavaScript

// eslint-disable-next-line @typescript-eslint/no-unused-vars
import * as tf from '@tensorflow/tfjs-node';
import use from '@tensorflow-models/universal-sentence-encoder';
import fs from 'fs-extra';
import glob from 'glob';
import path from 'path';
import { marked } from 'marked';
/**
 * Read a markdown file and reduce it to plain text.
 *
 * The file is read as UTF-8, a leading YAML frontmatter block (if any) is
 * dropped, the remainder is rendered with `marked`, and everything that looks
 * like an HTML tag is stripped from the rendered output.
 *
 * @param {string} filePath - Path of the markdown file to read.
 * @returns {Promise<string>} The rendered text with tags removed.
 */
async function extractTextFromMarkdown(filePath) {
  const markdown = await fs.readFile(filePath, 'utf8');

  // Frontmatter is only valid at the very start of the file, delimited by
  // lines consisting of `---`. Anchoring the match (and dropping the global
  // flag) keeps `---` horizontal rules and table separator rows in the body
  // from being treated as frontmatter delimiters and swallowing real content.
  // An optional BOM, CRLF line endings and an empty block are tolerated.
  const leadingFrontmatter =
    /^\uFEFF?---[ \t]*\r?\n(?:[\s\S]*?\r?\n)?---[ \t]*(?:\r?\n|$)/;
  const body = markdown.replace(leadingFrontmatter, '');

  // Render to HTML first, then drop the tags so only text content remains.
  return marked(body).replace(/<[^>]*>/g, '');
}
/**
 * Generate Universal Sentence Encoder embeddings for every markdown file
 * under a directory and write them to a JSON file.
 *
 * Each output record has the shape `{ id, vector, section, filename }`, where
 * `id` is the file name without `.md`, `section` is the name of the file's
 * parent directory, and `vector` is the first row of the embedding tensor.
 *
 * @param {string} directoryPath - Directory searched recursively for `*.md`
 *   files; `LICENSE.md` directly inside it is ignored.
 * @param {string} [outputPath='embeddings.json'] - Where to write the JSON.
 * @returns {Promise<void>} Resolves once the JSON file has been written.
 */
async function generateEmbeddingsForDirectory(directoryPath, outputPath = 'embeddings.json') {
  // Get all markdown files in the directory tree.
  const files = glob.sync(`${directoryPath}/**/*.md`, {
    ignore: [`${directoryPath}/LICENSE.md`],
  });

  // Extract the plain text of every file in parallel.
  const posts = await Promise.all(
    files.map(async (file) => ({
      id: path.basename(file, '.md'), // Use the filename (sans extension) as ID
      text: await extractTextFromMarkdown(file),
      section: path.basename(path.dirname(file)),
      filename: path.basename(file),
    })),
  );

  // Load the Universal Sentence Encoder model.
  const model = await use.load();

  // Embed each post on its own and pull the single row out as a plain array.
  // The tensor is disposed right away so its backing memory is released
  // instead of accumulating for the whole run.
  const vectors = await Promise.all(
    posts.map(async ({ text }) => {
      const tensor = await model.embed([text]);
      try {
        return tensor.arraySync()[0];
      } finally {
        tensor.dispose();
      }
    }),
  );

  // Map the vectors back onto their posts.
  const poemEmbeddings = posts.map((post, index) => ({
    id: post.id,
    vector: vectors[index],
    section: post.section,
    // NOTE(review): this emits the extension-less id, not `post.filename`
    // (which keeps the `.md` suffix). Left unchanged so consumers of the JSON
    // keep seeing the same value — confirm which one is intended.
    filename: post.id,
  }));

  // Await the write so callers only see completion (or the error) once the
  // file is actually on disk, rather than leaving the promise floating.
  await fs.writeJson(outputPath, poemEmbeddings);
}
// Script entry point. Update path accordingly.
// The rejection handler reports failures and marks the process as failed
// instead of leaving the returned promise unhandled.
generateEmbeddingsForDirectory('src/posts').catch((error) => {
  console.error('Failed to generate embeddings:', error);
  process.exitCode = 1;
});