get urls working with mozilla's readability library

This commit is contained in:
Silas 2023-04-09 17:50:42 -04:00
parent deb6052de8
commit a2d0010a77
Signed by: silentsilas
GPG Key ID: 4199EFB7DAA34349
9 changed files with 1182 additions and 57 deletions

960
package-lock.json generated

File diff suppressed because it is too large Load Diff

View File

@ -14,6 +14,7 @@
"format": "prettier --plugin-search-dir . --write ." "format": "prettier --plugin-search-dir . --write ."
}, },
"devDependencies": { "devDependencies": {
"@fortawesome/free-regular-svg-icons": "^6.4.0",
"@playwright/test": "^1.28.1", "@playwright/test": "^1.28.1",
"@skeletonlabs/skeleton": "^1.1.0", "@skeletonlabs/skeleton": "^1.1.0",
"@sveltejs/adapter-auto": "^2.0.0", "@sveltejs/adapter-auto": "^2.0.0",
@ -21,6 +22,7 @@
"@sveltejs/kit": "^1.5.0", "@sveltejs/kit": "^1.5.0",
"@tailwindcss/forms": "^0.5.3", "@tailwindcss/forms": "^0.5.3",
"@tailwindcss/typography": "^0.5.9", "@tailwindcss/typography": "^0.5.9",
"@types/jsdom": "^21.1.1",
"@typescript-eslint/eslint-plugin": "^5.45.0", "@typescript-eslint/eslint-plugin": "^5.45.0",
"@typescript-eslint/parser": "^5.45.0", "@typescript-eslint/parser": "^5.45.0",
"autoprefixer": "^10.4.14", "autoprefixer": "^10.4.14",
@ -42,6 +44,9 @@
"type": "module", "type": "module",
"dependencies": { "dependencies": {
"@dqbd/tiktoken": "^1.0.4", "@dqbd/tiktoken": "^1.0.4",
"langchain": "^0.0.51" "@mozilla/readability": "^0.4.4",
"jsdom": "^21.1.1",
"langchain": "^0.0.51",
"svelte-fa": "^3.0.3"
} }
} }

9
src/app.d.ts vendored
View File

@ -7,3 +7,12 @@ declare namespace App {
// interface Error {} // interface Error {}
// interface Platform {} // interface Platform {}
} }
// Ambient shim for the package's `index.es` entry point: importing from this
// path yields the same exports (and types) as the package root,
// '@fortawesome/free-regular-svg-icons'.
declare module '@fortawesome/free-regular-svg-icons/index.es' {
export * from '@fortawesome/free-regular-svg-icons';
}
// Ambient shim that lets TypeScript resolve a default import from the
// 'svelte-fa/src/fa.svelte' source path.
// The default export is typed as `any`, so nothing passed to the component is
// type-checked.
declare module 'svelte-fa/src/fa.svelte' {
const Fa: any;
export default Fa;
}

144
src/lib/types/@mozilla/readability.d.ts vendored Normal file
View File

@ -0,0 +1,144 @@
// Type definitions for non-npm package mozilla-readability 0.2
// Project: https://github.com/mozilla/readability
// Definitions by: Charles Vandevoorde <https://github.com/charlesvdv>, Alex Wendland <https://github.com/awendland>
// Definitions: https://github.com/DefinitelyTyped/DefinitelyTyped
// TypeScript Version: 2.2
declare module '@mozilla/readability' {
	// Everything inside `declare module` is already ambient, so the members below
	// use `export` rather than a nested `declare` modifier (which TypeScript
	// rejects with TS1038 when declaration files are checked).

	/**
	 * A standalone version of the readability library used for Firefox Reader View.
	 *
	 * Note that isProbablyReaderable() was moved into a separate file in https://github.com/mozilla/readability/commit/2620542dd1e8380220d82afa97a2c283ae636e40
	 * and therefore is no longer part of the Readability class.
	 */
	export class Readability {
		/**
		 * ## Usage on the web
		 *
		 * To parse a document, you must create a new Readability object from a
		 * DOM document object, and then call parse(). Here's an example:
		 *
		 * ```js
		 * var article = new Readability(document).parse();
		 * ```
		 *
		 * If you're using Readability on the web, you will likely be able to
		 * use a document reference from elsewhere (e.g. fetched via XMLHttpRequest,
		 * in a same-origin <iframe> you have access to, etc.).
		 *
		 * ## Usage from node.js
		 *
		 * In node.js, you won't generally have a DOM document object. To obtain one, you can use external
		 * libraries like [jsdom](https://github.com/tmpvar/jsdom). While this repository contains a parser of
		 * its own (`JSDOMParser`), that is restricted to reading XML-compatible markup and therefore we do
		 * not recommend it for general use.
		 *
		 * If you're using `jsdom` to create a DOM object, you should ensure that the page doesn't run (page)
		 * scripts (avoid fetching remote resources etc.) as well as passing it the page's URI as the `url`
		 * property of the `options` object you pass the `JSDOM` constructor.
		 *
		 * ```js
		 * var JSDOM = require('jsdom').JSDOM;
		 * var doc = new JSDOM("<body>Here's a bunch of text</body>", {
		 *   url: "https://www.example.com/the-page-i-got-the-source-from",
		 * });
		 * let reader = new Readability(doc.window.document);
		 * let article = reader.parse();
		 * ```
		 *
		 * @param doc - DOM document to extract the article from.
		 * @param options - Optional tuning knobs; see {@link Readability.Options}.
		 */
		constructor(doc: Document, options?: Readability.Options);

		/**
		 * Runs readability.
		 *
		 * ## Workflow:
		 *
		 * 1. Prep the document by removing script tags, css, etc.
		 * 2. Build readability's DOM tree.
		 * 3. Grab the article content from the current dom tree.
		 * 4. Replace the current DOM tree with the new one.
		 * 5. Read peacefully.
		 *
		 * ## Additional notes:
		 *
		 * Readability's parse() works by modifying the DOM. This removes some
		 * elements in the web page. You could avoid this by passing the clone
		 * of the document object while creating a Readability object.
		 *
		 * ```js
		 * var documentClone = document.cloneNode(true);
		 * var article = new Readability(documentClone).parse();
		 * ```
		 *
		 * The response will be null if the processing failed (https://github.com/mozilla/readability/blob/52ab9b5c8916c306a47b2119270dcdabebf9d203/Readability.js#L2038)
		 *
		 * @returns The extracted article, or `null` when processing failed.
		 */
		parse(): Readability.ParseResult | null;
	}

	/** Types that accompany the {@link Readability} class (merged with it by name). */
	export namespace Readability {
		/** Options accepted by the {@link Readability} constructor. */
		interface Options {
			/**
			 * Control whether log messages are sent to the console
			 */
			debug?: boolean | undefined;
			/**
			 * Set a maximum size on the documents that will be processed. This size is
			 * checked before any parsing operations occur. If the number of elements in
			 * the document exceeds this threshold then an Error will be thrown.
			 *
			 * See implementation details at https://github.com/mozilla/readability/blob/52ab9b5c8916c306a47b2119270dcdabebf9d203/Readability.js#L2019
			 */
			maxElemsToParse?: number | undefined;
			// NOTE(review): undocumented here; presumably the number of top candidates
			// compared while scoring content — confirm against the upstream README.
			nbTopCandidates?: number | undefined;
			/**
			 * Minimum number of characters in the extracted textContent in order to
			 * consider the article correctly identified. If the threshold is not met then
			 * the extraction process will automatically run again with different flags.
			 *
			 * See implementation details at https://github.com/mozilla/readability/blob/52ab9b5c8916c306a47b2119270dcdabebf9d203/Readability.js#L1208
			 *
			 * Changed from wordThreshold in https://github.com/mozilla/readability/commit/3ff9a166fb27928f222c4c0722e730eda412658a
			 */
			charThreshold?: number | undefined;
			/**
			 * parse() removes the class="" attribute from every element in the given
			 * subtree, except those that match CLASSES_TO_PRESERVE and
			 * the classesToPreserve array from the options object.
			 */
			classesToPreserve?: string[] | undefined;
			/**
			 * By default Readability will strip all classes from the HTML elements in the
			 * processed article. By setting this to `true` the classes will be retained.
			 *
			 * This is a blanket alternative to `classesToPreserve`.
			 *
			 * Added in https://github.com/mozilla/readability/commit/2982216913af2c66b0690e88606b03116553ad92
			 */
			keepClasses?: boolean | undefined;
		}

		/**
		 * Article data produced by a successful {@link Readability.parse} call.
		 *
		 * NOTE(review): the metadata fields (byline, dir, excerpt, siteName) are
		 * declared as plain strings; upstream may leave some of them null when the
		 * page lacks that metadata — confirm before dereferencing without a guard.
		 */
		interface ParseResult {
			/** Article title */
			title: string;
			/** Author metadata */
			byline: string;
			/** Content direction */
			dir: string;
			/** HTML string of processed article content */
			content: string;
			/** non-HTML version of `content` */
			textContent: string;
			/** Length of an article, in characters */
			length: number;
			/** Article description, or short excerpt from the content */
			excerpt: string;
			/** Article site name */
			siteName: string;
		}
	}
}

View File

@ -14,10 +14,9 @@
<!-- App Bar --> <!-- App Bar -->
<AppBar> <AppBar>
<svelte:fragment slot="lead"> <svelte:fragment slot="lead">
<strong class="text-xl uppercase">Sumi Ai</strong> <a href="/"><strong class="text-xl uppercase">Sumi Ai</strong></a>
</svelte:fragment> </svelte:fragment>
<svelte:fragment slot="trail"> <svelte:fragment slot="trail">
<a class="btn btn-sm variant-ringed-tertiary" href="/"> Home </a>
<a class="btn btn-sm variant-ringed-tertiary" href="/about"> About </a> <a class="btn btn-sm variant-ringed-tertiary" href="/about"> About </a>
</svelte:fragment> </svelte:fragment>
</AppBar> </AppBar>

View File

@ -1,20 +1,40 @@
<script lang="ts"> <script lang="ts">
import { goto } from '$app/navigation'; import { goto } from '$app/navigation';
import summary from '$lib/shared/stores/summary'; import summary from '$lib/shared/stores/summary';
import Fa from 'svelte-fa/src/fa.svelte';
import { faTrashCan } from '@fortawesome/free-regular-svg-icons';
let files: FileList; let files: FileList | null;
let isDisabled = true; let url: string = '';
let urlInput: HTMLInputElement;
let fileForm: HTMLFormElement;
let hasFile = true;
$: if (files && files.length >= 0) { $: if (files && files.length >= 0) {
isDisabled = false; hasFile = true;
url = '';
} else { } else {
isDisabled = true; hasFile = false;
} }
/**
 * Discards the currently chosen upload: resets the file form and records that
 * no file is selected.
 */
function removeFile() {
	fileForm.reset();
	hasFile = false;
}
/**
 * Packs the given address into a form payload under the `url` field and hands
 * it to `submit`.
 *
 * @param url - Address of the page to summarize.
 * @returns Whatever `submit` resolves to for that payload.
 */
async function submitUrl(url: string) {
	const payload = new FormData();
	payload.append('url', url);
	return submit(payload);
}
const submitFile = async (file: File) => { const submitFile = async (file: File) => {
const formData = new FormData(); const formData = new FormData();
formData.append('file', files[0]); formData.append('file', file);
return submit(formData);
};
const submit = async (formData: FormData) => {
const response = await fetch('/', { const response = await fetch('/', {
method: 'POST', method: 'POST',
body: formData body: formData
@ -24,12 +44,16 @@
const summarize = async () => { const summarize = async () => {
try { try {
if (files && files.length >= 1) { let result;
const result = await submitFile(files[0]); if (hasFile) {
console.log(result); if (!files) throw new Error('File missing');
summary.set(JSON.stringify(result.response)); if (files.length <= 0) throw new Error('File missing');
goto('chat'); result = await submitFile(files[0]);
} else {
result = await submitUrl(url);
} }
summary.set(JSON.stringify(result.response));
return goto('chat');
} catch (error) { } catch (error) {
alert((error as App.Error).message); alert((error as App.Error).message);
console.error(`Error: ${JSON.stringify(error)}`); console.error(`Error: ${JSON.stringify(error)}`);
@ -40,13 +64,40 @@
<div class="container h-full mx-auto flex justify-center items-center px-4"> <div class="container h-full mx-auto flex justify-center items-center px-4">
<div class="text-center"> <div class="text-center">
<h2>Please Sumi</h2> <h2>Please Sumi</h2>
<p class="my-4">Select the text file you'd like to have summarized.</p> <p class="my-4">Enter the URL you'd like to have summarized.</p>
<input type="file" accept=".txt" name="conversation" bind:files /> <div>
<input
class="input"
type="text"
bind:value={url}
bind:this={urlInput}
disabled={hasFile}
placeholder="https://stupid.article/you-will-never-believe-this-shit"
/>
</div>
<p class="my-6"></p>
<div>
<p>Or you can submit a text file instead:</p>
<form bind:this={fileForm}>
<div class="flex flex-row">
<input class="input my-4" type="file" accept=".txt" name="conversation" bind:files />
{#if hasFile}
<button
type="button"
class="btn btn-icon-sm variant-ghost-error self-center"
style="height: 100%;"
on:click={removeFile}
><Fa icon={faTrashCan} />
</button>
{/if}
</div>
</form>
</div>
<div class="flex justify-center space-x-2 my-12"> <div class="flex justify-center space-x-2 my-12">
<button <button
type="button" type="button"
class="btn variant-ringed-tertiary" class="btn variant-ringed-tertiary"
disabled={isDisabled} disabled={!hasFile && url.length <= 0}
on:click={summarize}>Summarize</button on:click={summarize}>Summarize</button
> >
</div> </div>

View File

@ -3,14 +3,41 @@ import type { RequestHandler } from './$types';
import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter'; import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
import { OpenAI } from 'langchain/llms'; import { OpenAI } from 'langchain/llms';
import { loadSummarizationChain } from 'langchain/chains'; import { loadSummarizationChain } from 'langchain/chains';
import { Readability } from '@mozilla/readability';
/**
 * Downloads the page at `url` and extracts its readable article text.
 *
 * The HTML is parsed with jsdom (loaded lazily through a dynamic import) and
 * handed to Mozilla's Readability.
 *
 * @param url - Absolute address of the page to summarize.
 * @returns The article's plain-text content, or `undefined` when Readability
 *   could not extract an article from the page.
 * @throws Error when the remote server answers with a non-2xx status; network
 *   failures and malformed URLs surface as whatever `fetch` rejects with.
 */
const parseSite = async (url: string): Promise<string | undefined> => {
	const { JSDOM } = await import('jsdom');

	// Plain GET. `Access-Control-Allow-Origin` is a response header and CORS is
	// enforced by browsers, so neither belongs on this server-side request.
	const response = await fetch(url);
	if (!response.ok) {
		// Without this check an error page (404, 500, ...) would be summarized as
		// if it were the article.
		throw new Error(`Could not fetch site (HTTP ${response.status})`);
	}

	const html = await response.text();
	// Tell jsdom where the markup came from, as the Readability usage notes
	// recommend; prefer the post-redirect address when fetch reports one.
	const dom = new JSDOM(html, { url: response.url || url });
	const article = new Readability(dom.window.document).parse();
	return article?.textContent;
};
export const POST = (async ({ request, url }) => { export const POST = (async ({ request, url }) => {
try { try {
const form = await request.formData(); const form = await request.formData();
const url = form.get('url');
const file = form.get('file'); const file = form.get('file');
if (!file) throw new Error('No prompt found in the request.'); if (!file && !url) throw new Error('No prompt found in the request.');
const text = await (file as Blob).text(); let text: string;
if (url) {
const article = await parseSite(url as string);
if (!article) throw new Error('Could not parse site');
text = article;
} else if (file) {
text = await (file as Blob).text();
} else {
throw new Error('Unknown error has occurred.');
}
const model = new OpenAI({ temperature: 0 }); const model = new OpenAI({ temperature: 0 });
const textSplitter = new RecursiveCharacterTextSplitter({ chunkSize: 1000 }); const textSplitter = new RecursiveCharacterTextSplitter({ chunkSize: 1000 });

View File

@ -7,17 +7,17 @@
require that you split the text and ask for ChatGPT to wait until the last prompt with the text require that you split the text and ask for ChatGPT to wait until the last prompt with the text
before generating a summary. before generating a summary.
</p> </p>
<p class="font-sans my-4"> <p class="font-sans">
ChatGPT's web interface also can't be adjusted to lower the probability of the AI spouting ChatGPT's web interface also can't be adjusted to lower the probability of the AI spouting
unrelated nonsense. The technical term is "temperature", and the higher the temperature, the unrelated nonsense. The technical term is "temperature", and the higher the temperature, the
more randomness in the output. more randomness in the output.
</p> </p>
<p class="font-sans"> <p class="font-sans my-4">
With Sumi, you can simply enter a URL or upload the entire document as a text file, and it will With Sumi, you can simply enter a URL or upload the entire document as a text file, and it will
handle everything else. handle everything else.
</p> </p>
<a <a
class="btn variant-ringed-tertiary my-12" class="btn variant-ringed-primary my-12"
href="https://git.silentsilas.com/silentsilas/Sumi" href="https://git.silentsilas.com/silentsilas/Sumi"
target="_blank" target="_blank"
rel="noreferrer" rel="noreferrer"

View File

@ -1,4 +1,5 @@
<script lang="ts"> <script lang="ts">
import { goto } from '$app/navigation';
import prompt from '$lib/shared/stores/summary'; import prompt from '$lib/shared/stores/summary';
let currentSummary: string; let currentSummary: string;
@ -13,4 +14,5 @@
> >
<h2 class="my-4">Summary</h2> <h2 class="my-4">Summary</h2>
<div>{currentSummary}</div> <div>{currentSummary}</div>
<button class="btn variant-ringed-primary my-12" on:click={() => goto('/')}>Go Back</button>
</div> </div>