Get URLs working with Mozilla's Readability library
This commit is contained in:
parent
deb6052de8
commit
a2d0010a77
File diff suppressed because it is too large
Load Diff
|
@ -14,6 +14,7 @@
|
|||
"format": "prettier --plugin-search-dir . --write ."
|
||||
},
|
||||
"devDependencies": {
|
||||
"@fortawesome/free-regular-svg-icons": "^6.4.0",
|
||||
"@playwright/test": "^1.28.1",
|
||||
"@skeletonlabs/skeleton": "^1.1.0",
|
||||
"@sveltejs/adapter-auto": "^2.0.0",
|
||||
|
@ -21,6 +22,7 @@
|
|||
"@sveltejs/kit": "^1.5.0",
|
||||
"@tailwindcss/forms": "^0.5.3",
|
||||
"@tailwindcss/typography": "^0.5.9",
|
||||
"@types/jsdom": "^21.1.1",
|
||||
"@typescript-eslint/eslint-plugin": "^5.45.0",
|
||||
"@typescript-eslint/parser": "^5.45.0",
|
||||
"autoprefixer": "^10.4.14",
|
||||
|
@ -42,6 +44,9 @@
|
|||
"type": "module",
|
||||
"dependencies": {
|
||||
"@dqbd/tiktoken": "^1.0.4",
|
||||
"langchain": "^0.0.51"
|
||||
"@mozilla/readability": "^0.4.4",
|
||||
"jsdom": "^21.1.1",
|
||||
"langchain": "^0.0.51",
|
||||
"svelte-fa": "^3.0.3"
|
||||
}
|
||||
}
|
||||
|
|
|
@ -7,3 +7,12 @@ declare namespace App {
|
|||
// interface Error {}
|
||||
// interface Platform {}
|
||||
}
|
||||
|
||||
/**
 * Type shim for the `index.es` entry point of the Font Awesome regular icon set.
 * It forwards every export of the package root so deep imports are typed the
 * same way as root imports.
 */
declare module '@fortawesome/free-regular-svg-icons/index.es' {
	export * from '@fortawesome/free-regular-svg-icons';
}

/**
 * Type shim for importing the svelte-fa component directly from its source file.
 * Re-exports the package root's default export so the component keeps its real
 * prop types instead of collapsing to `any`.
 * NOTE(review): assumes the installed svelte-fa version ships typings for its
 * root entry point — confirm with `svelte-check`.
 */
declare module 'svelte-fa/src/fa.svelte' {
	export { default } from 'svelte-fa';
}
|
||||
|
|
|
@ -0,0 +1,144 @@
|
|||
// Type definitions for non-npm package mozilla-readability 0.2
|
||||
// Project: https://github.com/mozilla/readability
|
||||
// Definitions by: Charles Vandevoorde <https://github.com/charlesvdv>, Alex Wendland <https://github.com/awendland>
|
||||
// Definitions: https://github.com/DefinitelyTyped/DefinitelyTyped
|
||||
// TypeScript Version: 2.2
|
||||
declare module '@mozilla/readability' {
	/**
	 * A standalone version of the readability library used for Firefox Reader View.
	 *
	 * Note that isProbablyReaderable() was moved into a separate file in https://github.com/mozilla/readability/commit/2620542dd1e8380220d82afa97a2c283ae636e40
	 * and therefore is no longer part of the Readability class.
	 *
	 * Declarations inside this block carry no `declare` modifier: the enclosing
	 * `declare module` already makes the context ambient.
	 */
	export class Readability {
		/**
		 * ## Usage on the web
		 *
		 * To parse a document, you must create a new Readability object from a
		 * DOM document object, and then call parse(). Here's an example:
		 *
		 * ```js
		 * var article = new Readability(document).parse();
		 * ```
		 *
		 * If you're using Readability on the web, you will likely be able to
		 * use a document reference from elsewhere (e.g. fetched via XMLHttpRequest,
		 * in a same-origin <iframe> you have access to, etc.).
		 *
		 * ## Usage from node.js
		 *
		 * In node.js, you won't generally have a DOM document object. To obtain one, you can use external
		 * libraries like [jsdom](https://github.com/tmpvar/jsdom). While this repository contains a parser of
		 * its own (`JSDOMParser`), that is restricted to reading XML-compatible markup and therefore we do
		 * not recommend it for general use.
		 *
		 * If you're using `jsdom` to create a DOM object, you should ensure that the page doesn't run (page)
		 * scripts (avoid fetching remote resources etc.) as well as passing it the page's URI as the `url`
		 * property of the `options` object you pass the `JSDOM` constructor.
		 *
		 * ```js
		 * var JSDOM = require('jsdom').JSDOM;
		 * var doc = new JSDOM("<body>Here's a bunch of text</body>", {
		 *   url: "https://www.example.com/the-page-i-got-the-source-from",
		 * });
		 * let reader = new Readability(doc.window.document);
		 * let article = reader.parse();
		 * ```
		 */
		constructor(doc: Document, options?: Readability.Options);

		/**
		 * Runs readability.
		 *
		 * ## Workflow:
		 *
		 *  1. Prep the document by removing script tags, css, etc.
		 *  2. Build readability's DOM tree.
		 *  3. Grab the article content from the current dom tree.
		 *  4. Replace the current DOM tree with the new one.
		 *  5. Read peacefully.
		 *
		 * ## Additional notes:
		 *
		 * Readability's parse() works by modifying the DOM. This removes some
		 * elements in the web page. You could avoid this by passing the clone
		 * of the document object while creating a Readability object.
		 *
		 * ```js
		 * var documentClone = document.cloneNode(true);
		 * var article = new Readability(documentClone).parse();
		 * ```
		 *
		 * The response will be null if the processing failed (https://github.com/mozilla/readability/blob/52ab9b5c8916c306a47b2119270dcdabebf9d203/Readability.js#L2038)
		 */
		parse(): Readability.ParseResult | null;
	}

	export namespace Readability {
		interface Options {
			/**
			 * Control whether log messages are sent to the console
			 */
			debug?: boolean | undefined;

			/**
			 * Set a maximum size on the documents that will be processed. This size is
			 * checked before any parsing operations occur. If the number of elements in
			 * the document exceeds this threshold then an Error will be thrown.
			 *
			 * See implementation details at https://github.com/mozilla/readability/blob/52ab9b5c8916c306a47b2119270dcdabebf9d203/Readability.js#L2019
			 */
			maxElemsToParse?: number | undefined;

			/**
			 * Number of top candidates taken into account while scoring content.
			 * NOTE(review): wording taken from the upstream README — confirm against
			 * the installed library version.
			 */
			nbTopCandidates?: number | undefined;

			/**
			 * Minimum number of characters in the extracted textContent in order to
			 * consider the article correctly identified. If the threshold is not met then
			 * the extraction process will automatically run again with different flags.
			 *
			 * See implementation details at https://github.com/mozilla/readability/blob/52ab9b5c8916c306a47b2119270dcdabebf9d203/Readability.js#L1208
			 *
			 * Changed from wordThreshold in https://github.com/mozilla/readability/commit/3ff9a166fb27928f222c4c0722e730eda412658a
			 */
			charThreshold?: number | undefined;

			/**
			 * parse() removes the class="" attribute from every element in the given
			 * subtree, except those that match CLASSES_TO_PRESERVE and
			 * the classesToPreserve array from the options object.
			 */
			classesToPreserve?: string[] | undefined;

			/**
			 * By default Readability will strip all classes from the HTML elements in the
			 * processed article. By setting this to `true` the classes will be retained.
			 *
			 * This is a blanket alternative to `classesToPreserve`.
			 *
			 * Added in https://github.com/mozilla/readability/commit/2982216913af2c66b0690e88606b03116553ad92
			 */
			keepClasses?: boolean | undefined;
		}

		interface ParseResult {
			/** Article title */
			title: string;
			/** Author metadata */
			byline: string;
			/** Content direction */
			dir: string;
			/** HTML string of processed article content */
			content: string;
			/** non-HTML version of `content` */
			textContent: string;
			/** Length of an article, in characters */
			length: number;
			/** Article description, or short excerpt from the content */
			excerpt: string;
			/** Article site name */
			siteName: string;
		}
	}
}
|
|
@ -14,10 +14,9 @@
|
|||
<!-- App Bar -->
|
||||
<AppBar>
|
||||
<svelte:fragment slot="lead">
|
||||
<strong class="text-xl uppercase">Sumi Ai</strong>
|
||||
<a href="/"><strong class="text-xl uppercase">Sumi Ai</strong></a>
|
||||
</svelte:fragment>
|
||||
<svelte:fragment slot="trail">
|
||||
<a class="btn btn-sm variant-ringed-tertiary" href="/"> Home </a>
|
||||
<a class="btn btn-sm variant-ringed-tertiary" href="/about"> About </a>
|
||||
</svelte:fragment>
|
||||
</AppBar>
|
||||
|
|
|
@ -1,20 +1,40 @@
|
|||
<script lang="ts">
|
||||
import { goto } from '$app/navigation';
|
||||
import summary from '$lib/shared/stores/summary';
|
||||
import Fa from 'svelte-fa/src/fa.svelte';
|
||||
import { faTrashCan } from '@fortawesome/free-regular-svg-icons';
|
||||
|
||||
let files: FileList;
|
||||
let isDisabled = true;
|
||||
let files: FileList | null;
|
||||
let url: string = '';
|
||||
let urlInput: HTMLInputElement;
|
||||
let fileForm: HTMLFormElement;
|
||||
let hasFile = true;
|
||||
|
||||
// Keep the form state in sync with the file picker.
// A FileList's length is never negative, so the check must be `> 0`: an empty
// selection has to fall through to the "no file" branch, otherwise the URL
// input is disabled although nothing can be submitted.
$: if (files && files.length > 0) {
	isDisabled = false;
	hasFile = true;
	// A chosen file takes precedence over any URL typed earlier.
	url = '';
} else {
	isDisabled = true;
	hasFile = false;
}
|
||||
|
||||
/**
 * Discards the currently selected file: resets the upload form and marks the
 * component as having no file.
 */
function removeFile(): void {
	fileForm.reset();
	hasFile = false;
}
|
||||
|
||||
/**
 * Packs a URL into form data under the `url` key and passes it to `submit`.
 *
 * @param url - Address of the page to summarize.
 * @returns Whatever `submit` resolves to.
 */
async function submitUrl(url: string) {
	const payload = new FormData();
	payload.append('url', url);
	return submit(payload);
}
|
||||
|
||||
/**
 * Packs a file into form data under the `file` key and passes it to `submit`.
 *
 * Only the `file` argument is appended. Reading `files[0]` from component state
 * here as well would attach the upload twice and would fail when `files` is null.
 *
 * @param file - The text file to summarize.
 * @returns Whatever `submit` resolves to.
 */
const submitFile = async (file: File) => {
	const formData = new FormData();
	formData.append('file', file);
	return submit(formData);
};
|
||||
|
||||
const submit = async (formData: FormData) => {
|
||||
const response = await fetch('/', {
|
||||
method: 'POST',
|
||||
body: formData
|
||||
|
@ -24,12 +44,16 @@
|
|||
|
||||
const summarize = async () => {
|
||||
try {
|
||||
if (files && files.length >= 1) {
|
||||
const result = await submitFile(files[0]);
|
||||
console.log(result);
|
||||
summary.set(JSON.stringify(result.response));
|
||||
goto('chat');
|
||||
let result;
|
||||
if (hasFile) {
|
||||
if (!files) throw new Error('File missing');
|
||||
if (files.length <= 0) throw new Error('File missing');
|
||||
result = await submitFile(files[0]);
|
||||
} else {
|
||||
result = await submitUrl(url);
|
||||
}
|
||||
summary.set(JSON.stringify(result.response));
|
||||
return goto('chat');
|
||||
} catch (error) {
|
||||
alert((error as App.Error).message);
|
||||
console.error(`Error: ${JSON.stringify(error)}`);
|
||||
|
@ -40,13 +64,40 @@
|
|||
<div class="container h-full mx-auto flex justify-center items-center px-4">
|
||||
<div class="text-center">
|
||||
<h2>Please Sumi</h2>
|
||||
<p class="my-4">Select the text file you'd like to have summarized.</p>
|
||||
<input type="file" accept=".txt" name="conversation" bind:files />
|
||||
<p class="my-4">Enter the URL you'd like to have summarized.</p>
|
||||
<div>
|
||||
<input
|
||||
class="input"
|
||||
type="text"
|
||||
bind:value={url}
|
||||
bind:this={urlInput}
|
||||
disabled={hasFile}
|
||||
placeholder="https://stupid.article/you-will-never-believe-this-shit"
|
||||
/>
|
||||
</div>
|
||||
<p class="my-6">•</p>
|
||||
<div>
|
||||
<p>Or you can submit a text file instead:</p>
|
||||
<form bind:this={fileForm}>
|
||||
<div class="flex flex-row">
|
||||
<input class="input my-4" type="file" accept=".txt" name="conversation" bind:files />
|
||||
{#if hasFile}
|
||||
<button
|
||||
type="button"
|
||||
class="btn btn-icon-sm variant-ghost-error self-center"
|
||||
style="height: 100%;"
|
||||
on:click={removeFile}
|
||||
><Fa icon={faTrashCan} />
|
||||
</button>
|
||||
{/if}
|
||||
</div>
|
||||
</form>
|
||||
</div>
|
||||
<div class="flex justify-center space-x-2 my-12">
|
||||
<button
|
||||
type="button"
|
||||
class="btn variant-ringed-tertiary"
|
||||
disabled={isDisabled}
|
||||
disabled={!hasFile && url.length <= 0}
|
||||
on:click={summarize}>Summarize</button
|
||||
>
|
||||
</div>
|
||||
|
|
|
@ -3,14 +3,41 @@ import type { RequestHandler } from './$types';
|
|||
import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
|
||||
import { OpenAI } from 'langchain/llms';
|
||||
import { loadSummarizationChain } from 'langchain/chains';
|
||||
import { Readability } from '@mozilla/readability';
|
||||
|
||||
/**
 * Downloads a web page and extracts its readable article text with Readability.
 *
 * @param url - Absolute http(s) URL of the page; it originates from the submitted form.
 * @returns The article's plain text, or `undefined` when Readability finds no article.
 * @throws Error when the URL is malformed, uses a scheme other than http/https,
 *   or the remote server answers with a non-2xx status.
 */
const parseSite = async (url: string) => {
	// The value is user-supplied, so validate it before making any request.
	let target: URL;
	try {
		target = new URL(url);
	} catch {
		throw new Error(`Invalid URL: ${url}`);
	}
	if (target.protocol !== 'http:' && target.protocol !== 'https:') {
		throw new Error(`Unsupported URL protocol: ${target.protocol}`);
	}

	// Plain GET: `mode: 'cors'` only matters in browsers, and
	// Access-Control-Allow-Origin is a response header, so neither is sent here.
	const response = await fetch(target.href);
	if (!response.ok) {
		// Do not summarize an error page as if it were the article.
		throw new Error(`Failed to fetch ${target.href}: ${response.status} ${response.statusText}`);
	}
	const html = await response.text();

	const { JSDOM } = await import('jsdom');
	// Give jsdom the final (post-redirect) address so relative links resolve.
	const dom = new JSDOM(html, { url: response.url || target.href });
	const article = new Readability(dom.window.document).parse();
	return article?.textContent;
};
|
||||
|
||||
export const POST = (async ({ request, url }) => {
|
||||
try {
|
||||
const form = await request.formData();
|
||||
|
||||
const url = form.get('url');
|
||||
const file = form.get('file');
|
||||
if (!file) throw new Error('No prompt found in the request.');
|
||||
const text = await (file as Blob).text();
|
||||
if (!file && !url) throw new Error('No prompt found in the request.');
|
||||
let text: string;
|
||||
if (url) {
|
||||
const article = await parseSite(url as string);
|
||||
if (!article) throw new Error('Could not parse site');
|
||||
text = article;
|
||||
} else if (file) {
|
||||
text = await (file as Blob).text();
|
||||
} else {
|
||||
throw new Error('Unknown error has occurred.');
|
||||
}
|
||||
|
||||
const model = new OpenAI({ temperature: 0 });
|
||||
const textSplitter = new RecursiveCharacterTextSplitter({ chunkSize: 1000 });
|
||||
|
|
|
@ -7,17 +7,17 @@
|
|||
require that you split the text and ask for ChatGPT to wait until the last prompt with the text
|
||||
before generating a summary.
|
||||
</p>
|
||||
<p class="font-sans my-4">
|
||||
<p class="font-sans">
|
||||
ChatGPT's web interface also can't be adjusted to lower the probability of the AI spouting
|
||||
unrelated nonsense. The technical term is "temperature", and the higher the temperature, the
|
||||
more randomness in the output.
|
||||
</p>
|
||||
<p class="font-sans">
|
||||
<p class="font-sans my-4">
|
||||
With Sumi, you can simply enter a URL or upload the entire document as a text file, and it will
|
||||
handle everything else.
|
||||
</p>
|
||||
<a
|
||||
class="btn variant-ringed-tertiary my-12"
|
||||
class="btn variant-ringed-primary my-12"
|
||||
href="https://git.silentsilas.com/silentsilas/Sumi"
|
||||
target="_blank"
|
||||
rel="noreferrer"
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
<script lang="ts">
|
||||
import { goto } from '$app/navigation';
|
||||
import prompt from '$lib/shared/stores/summary';
|
||||
let currentSummary: string;
|
||||
|
||||
|
@ -13,4 +14,5 @@
|
|||
>
|
||||
<h2 class="my-4">Summary</h2>
|
||||
<div>{currentSummary}</div>
|
||||
<button class="btn variant-ringed-primary my-12" on:click={() => goto('/')}>Go Back</button>
|
||||
</div>
|
||||
|
|
Loading…
Reference in New Issue