40 lines
1.5 KiB
JavaScript
40 lines
1.5 KiB
JavaScript
// eslint-disable-next-line @typescript-eslint/no-unused-vars
|
|
import * as tf from '@tensorflow/tfjs-node';
|
|
import use from '@tensorflow-models/universal-sentence-encoder';
|
|
import fs from 'fs-extra';
|
|
import glob from 'glob';
|
|
import path from 'path';
|
|
import { marked } from 'marked';
|
|
|
|
/**
 * Replaces HTML character references with the characters they stand for.
 *
 * Handles decimal (`&#39;`) and hexadecimal (`&#x27;`) numeric references plus
 * the named references `amp`, `lt`, `gt`, `quot`, `apos` and `nbsp`. Any other
 * reference is left untouched. Decoding happens in a single pass, so
 * `&amp;lt;` becomes `&lt;` rather than `<`.
 *
 * @param {string} text - Text that may contain HTML character references.
 * @returns {string} The text with the supported references decoded.
 */
function decodeHtmlEntities(text) {
  const namedEntities = new Map([
    ['amp', '&'],
    ['lt', '<'],
    ['gt', '>'],
    ['quot', '"'],
    ['apos', "'"],
    ['nbsp', '\u00a0'],
  ]);

  return text.replace(
    /&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|([a-z]+));/g,
    (match, decimal, hex, name) => {
      if (name !== undefined) {
        return namedEntities.get(name) ?? match;
      }
      const codePoint =
        decimal !== undefined
          ? Number.parseInt(decimal, 10)
          : Number.parseInt(hex, 16);
      // String.fromCodePoint throws above U+10FFFF; keep such references as-is.
      return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : match;
    },
  );
}

/**
 * Reads a Markdown file and returns its content as plain text.
 *
 * The file is rendered to HTML with `marked`, the generated tags are removed,
 * and the character references left behind by the renderer (for example
 * `&#39;` for an apostrophe) are decoded back to literal characters.
 *
 * @param {string} filePath - Path of the Markdown file to read (as UTF-8).
 * @returns {Promise<string>} The rendered text without tags or encoded entities.
 */
async function extractTextFromMarkdown(filePath) {
  const markdown = await fs.readFile(filePath, 'utf8');
  const html = marked(markdown);
  // Strip tags BEFORE decoding so an escaped "&lt;b&gt;" in the poem is kept
  // as the literal text "<b>" instead of being removed as if it were a tag.
  const withoutTags = html.replace(/<[^>]*>/g, '');
  return decodeHtmlEntities(withoutTags);
}
|
|
|
|
/**
 * Embeds every Markdown file located directly inside a directory with the
 * Universal Sentence Encoder and saves the result to `embeddings.json`
 * (resolved against the current working directory).
 *
 * The saved JSON is an array of `{ id, vector }` records in the order the
 * files were matched: `id` is the file name without its `.md` extension and
 * `vector` is the first row of the tensor returned by `model.embed`.
 *
 * @param {string} directoryPath - Directory to scan; only `*.md` files at its
 *   top level are matched.
 * @returns {Promise<void>} Resolves once `embeddings.json` has been written;
 *   rejects if reading, embedding or writing fails.
 */
async function generateEmbeddingsForDirectory(directoryPath) {
  // Get all markdown files in directory
  const files = glob.sync(`${directoryPath}/*.md`);

  // Extract texts from markdown files, using the file name as the ID
  const poems = await Promise.all(
    files.map(async (file) => ({
      id: path.basename(file, '.md'),
      text: await extractTextFromMarkdown(file),
    })),
  );

  // Load the Universal Sentence Encoder model
  const model = await use.load();

  // Embed one poem at a time so only a single tensor is alive at any moment,
  // and release each tensor as soon as its values have been copied out.
  const poemEmbeddings = [];
  for (const poem of poems) {
    const embedding = await model.embed([poem.text]);
    try {
      const [vector] = await embedding.array();
      poemEmbeddings.push({ id: poem.id, vector });
    } finally {
      embedding.dispose();
    }
  }

  // Save embeddings to JSON file. Awaited so the returned promise settles only
  // after the write finishes and write errors reach the caller.
  await fs.writeJson('embeddings.json', poemEmbeddings);
}
|
|
|
|
// Script entry point: embed the poetry posts (update the path accordingly).
generateEmbeddingsForDirectory('src/posts/poetry').catch((error) => {
  // Handle the rejection explicitly instead of leaving the promise floating:
  // report what went wrong and make the process exit with a failure status.
  console.error('Failed to generate embeddings:', error);
  process.exitCode = 1;
});
|