get urls working with mozilla's readability library
This commit is contained in:
parent
deb6052de8
commit
a2d0010a77
File diff suppressed because it is too large
Load Diff
|
@ -14,6 +14,7 @@
|
||||||
"format": "prettier --plugin-search-dir . --write ."
|
"format": "prettier --plugin-search-dir . --write ."
|
||||||
},
|
},
|
||||||
"devDependencies": {
|
"devDependencies": {
|
||||||
|
"@fortawesome/free-regular-svg-icons": "^6.4.0",
|
||||||
"@playwright/test": "^1.28.1",
|
"@playwright/test": "^1.28.1",
|
||||||
"@skeletonlabs/skeleton": "^1.1.0",
|
"@skeletonlabs/skeleton": "^1.1.0",
|
||||||
"@sveltejs/adapter-auto": "^2.0.0",
|
"@sveltejs/adapter-auto": "^2.0.0",
|
||||||
|
@ -21,6 +22,7 @@
|
||||||
"@sveltejs/kit": "^1.5.0",
|
"@sveltejs/kit": "^1.5.0",
|
||||||
"@tailwindcss/forms": "^0.5.3",
|
"@tailwindcss/forms": "^0.5.3",
|
||||||
"@tailwindcss/typography": "^0.5.9",
|
"@tailwindcss/typography": "^0.5.9",
|
||||||
|
"@types/jsdom": "^21.1.1",
|
||||||
"@typescript-eslint/eslint-plugin": "^5.45.0",
|
"@typescript-eslint/eslint-plugin": "^5.45.0",
|
||||||
"@typescript-eslint/parser": "^5.45.0",
|
"@typescript-eslint/parser": "^5.45.0",
|
||||||
"autoprefixer": "^10.4.14",
|
"autoprefixer": "^10.4.14",
|
||||||
|
@ -42,6 +44,9 @@
|
||||||
"type": "module",
|
"type": "module",
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"@dqbd/tiktoken": "^1.0.4",
|
"@dqbd/tiktoken": "^1.0.4",
|
||||||
"langchain": "^0.0.51"
|
"@mozilla/readability": "^0.4.4",
|
||||||
|
"jsdom": "^21.1.1",
|
||||||
|
"langchain": "^0.0.51",
|
||||||
|
"svelte-fa": "^3.0.3"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -7,3 +7,12 @@ declare namespace App {
|
||||||
// interface Error {}
|
// interface Error {}
|
||||||
// interface Platform {}
|
// interface Platform {}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
declare module '@fortawesome/free-regular-svg-icons/index.es' {
|
||||||
|
export * from '@fortawesome/free-regular-svg-icons';
|
||||||
|
}
|
||||||
|
|
||||||
|
declare module 'svelte-fa/src/fa.svelte' {
|
||||||
|
const Fa: any;
|
||||||
|
export default Fa;
|
||||||
|
}
|
||||||
|
|
|
@ -0,0 +1,144 @@
|
||||||
|
// Type definitions for non-npm package mozilla-readability 0.2
|
||||||
|
// Project: https://github.com/mozilla/readability
|
||||||
|
// Definitions by: Charles Vandevoorde <https://github.com/charlesvdv>, Alex Wendland <https://github.com/awendland>
|
||||||
|
// Definitions: https://github.com/DefinitelyTyped/DefinitelyTyped
|
||||||
|
// TypeScript Version: 2.2
|
||||||
|
declare module '@mozilla/readability' {
|
||||||
|
/**
|
||||||
|
* A standalone version of the readability library used for Firefox Reader View.
|
||||||
|
*
|
||||||
|
* Note that isProbablyReaderable() was moved into a separate file in https://github.com/mozilla/readability/commit/2620542dd1e8380220d82afa97a2c283ae636e40
|
||||||
|
* and therefore is no longer part of the Readability class.
|
||||||
|
*/
|
||||||
|
declare class Readability {
|
||||||
|
/**
|
||||||
|
* ## Usage on the web
|
||||||
|
*
|
||||||
|
* To parse a document, you must create a new Readability object from a
|
||||||
|
* DOM document object, and then call parse(). Here's an example:
|
||||||
|
*
|
||||||
|
* ```js
|
||||||
|
* var article = new Readability(document).parse();
|
||||||
|
* ```
|
||||||
|
*
|
||||||
|
* If you're using Readability on the web, you will likely be able to
|
||||||
|
* use a document reference from elsewhere (e.g. fetched via XMLHttpRequest,
|
||||||
|
* in a same-origin <iframe> you have access to, etc.).
|
||||||
|
*
|
||||||
|
* ## Usage from node.js
|
||||||
|
*
|
||||||
|
* In node.js, you won't generally have a DOM document object. To obtain one, you can use external
|
||||||
|
* libraries like [jsdom](https://github.com/tmpvar/jsdom). While this repository contains a parser of
|
||||||
|
* its own (`JSDOMParser`), that is restricted to reading XML-compatible markup and therefore we do
|
||||||
|
* not recommend it for general use.
|
||||||
|
*
|
||||||
|
* If you're using `jsdom` to create a DOM object, you should ensure that the page doesn't run (page)
|
||||||
|
* scripts (avoid fetching remote resources etc.) as well as passing it the page's URI as the `url`
|
||||||
|
* property of the `options` object you pass the `JSDOM` constructor.
|
||||||
|
*
|
||||||
|
* ```js
|
||||||
|
* var JSDOM = require('jsdom').JSDOM;
|
||||||
|
* var doc = new JSDOM("<body>Here's a bunch of text</body>", {
|
||||||
|
* url: "https://www.example.com/the-page-i-got-the-source-from",
|
||||||
|
* });
|
||||||
|
* let reader = new Readability(doc.window.document);
|
||||||
|
* let article = reader.parse();
|
||||||
|
* ```
|
||||||
|
*/
|
||||||
|
constructor(doc: Document, options?: Readability.Options);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Runs readability.
|
||||||
|
*
|
||||||
|
* ## Workflow:
|
||||||
|
*
|
||||||
|
* 1. Prep the document by removing script tags, css, etc.
|
||||||
|
* 2. Build readability's DOM tree.
|
||||||
|
* 3. Grab the article content from the current dom tree.
|
||||||
|
* 4. Replace the current DOM tree with the new one.
|
||||||
|
* 5. Read peacefully.
|
||||||
|
*
|
||||||
|
* ## Additional notes:
|
||||||
|
*
|
||||||
|
* Readability's parse() works by modifying the DOM. This removes some
|
||||||
|
* elements in the web page. You could avoid this by passing the clone
|
||||||
|
* of the document object while creating a Readability object.
|
||||||
|
*
|
||||||
|
* ```js
|
||||||
|
* var documentClone = document.cloneNode(true);
|
||||||
|
* var article = new Readability(documentClone).parse();
|
||||||
|
* ```
|
||||||
|
*
|
||||||
|
* The response will be null if the processing failed (https://github.com/mozilla/readability/blob/52ab9b5c8916c306a47b2119270dcdabebf9d203/Readability.js#L2038)
|
||||||
|
*/
|
||||||
|
parse(): Readability.ParseResult | null;
|
||||||
|
}
|
||||||
|
|
||||||
|
declare namespace Readability {
|
||||||
|
interface Options {
|
||||||
|
/**
|
||||||
|
* Control whether log messages are sent to the console
|
||||||
|
*/
|
||||||
|
debug?: boolean | undefined;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Set a maximum size on the documents that will be processed. This size is
|
||||||
|
* checked before any parsing operations occur. If the number of elements in
|
||||||
|
* the document exceeds this threshold then an Error will be thrown.
|
||||||
|
*
|
||||||
|
* See implementation details at https://github.com/mozilla/readability/blob/52ab9b5c8916c306a47b2119270dcdabebf9d203/Readability.js#L2019
|
||||||
|
*/
|
||||||
|
maxElemsToParse?: number | undefined;
|
||||||
|
|
||||||
|
nbTopCandidates?: number | undefined;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Minimum number of characters in the extracted textContent in order to
|
||||||
|
* consider the article correctly identified. If the threshold is not met then
|
||||||
|
* the extraction process will automatically run again with different flags.
|
||||||
|
*
|
||||||
|
* See implementation details at https://github.com/mozilla/readability/blob/52ab9b5c8916c306a47b2119270dcdabebf9d203/Readability.js#L1208
|
||||||
|
*
|
||||||
|
* Changed from wordThreshold in https://github.com/mozilla/readability/commit/3ff9a166fb27928f222c4c0722e730eda412658a
|
||||||
|
*/
|
||||||
|
charThreshold?: number | undefined;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* parse() removes the class="" attribute from every element in the given
|
||||||
|
* subtree, except those that match CLASSES_TO_PRESERVE and
|
||||||
|
* the classesToPreserve array from the options object.
|
||||||
|
*/
|
||||||
|
classesToPreserve?: string[] | undefined;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* By default Readability will strip all classes from the HTML elements in the
|
||||||
|
* processed article. By setting this to `true` the classes will be retained.
|
||||||
|
*
|
||||||
|
* This is a blanket alternative to `classesToPreserve`.
|
||||||
|
*
|
||||||
|
* Added in https://github.com/mozilla/readability/commit/2982216913af2c66b0690e88606b03116553ad92
|
||||||
|
*/
|
||||||
|
|
||||||
|
keepClasses?: boolean | undefined;
|
||||||
|
}
|
||||||
|
|
||||||
|
interface ParseResult {
|
||||||
|
/** Article title */
|
||||||
|
title: string;
|
||||||
|
/** Author metadata */
|
||||||
|
byline: string;
|
||||||
|
/** Content direction */
|
||||||
|
dir: string;
|
||||||
|
/** HTML string of processed article content */
|
||||||
|
content: string;
|
||||||
|
/** non-HTML version of `content` */
|
||||||
|
textContent: string;
|
||||||
|
/** Length of an article, in characters */
|
||||||
|
length: number;
|
||||||
|
/** Article description, or short excerpt from the content */
|
||||||
|
excerpt: string;
|
||||||
|
/** Article site name */
|
||||||
|
siteName: string;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -14,10 +14,9 @@
|
||||||
<!-- App Bar -->
|
<!-- App Bar -->
|
||||||
<AppBar>
|
<AppBar>
|
||||||
<svelte:fragment slot="lead">
|
<svelte:fragment slot="lead">
|
||||||
<strong class="text-xl uppercase">Sumi Ai</strong>
|
<a href="/"><strong class="text-xl uppercase">Sumi Ai</strong></a>
|
||||||
</svelte:fragment>
|
</svelte:fragment>
|
||||||
<svelte:fragment slot="trail">
|
<svelte:fragment slot="trail">
|
||||||
<a class="btn btn-sm variant-ringed-tertiary" href="/"> Home </a>
|
|
||||||
<a class="btn btn-sm variant-ringed-tertiary" href="/about"> About </a>
|
<a class="btn btn-sm variant-ringed-tertiary" href="/about"> About </a>
|
||||||
</svelte:fragment>
|
</svelte:fragment>
|
||||||
</AppBar>
|
</AppBar>
|
||||||
|
|
|
@ -1,20 +1,40 @@
|
||||||
<script lang="ts">
|
<script lang="ts">
|
||||||
import { goto } from '$app/navigation';
|
import { goto } from '$app/navigation';
|
||||||
import summary from '$lib/shared/stores/summary';
|
import summary from '$lib/shared/stores/summary';
|
||||||
|
import Fa from 'svelte-fa/src/fa.svelte';
|
||||||
|
import { faTrashCan } from '@fortawesome/free-regular-svg-icons';
|
||||||
|
|
||||||
let files: FileList;
|
let files: FileList | null;
|
||||||
let isDisabled = true;
|
let url: string = '';
|
||||||
|
let urlInput: HTMLInputElement;
|
||||||
|
let fileForm: HTMLFormElement;
|
||||||
|
let hasFile = true;
|
||||||
|
|
||||||
$: if (files && files.length >= 0) {
|
$: if (files && files.length >= 0) {
|
||||||
isDisabled = false;
|
hasFile = true;
|
||||||
|
url = '';
|
||||||
} else {
|
} else {
|
||||||
isDisabled = true;
|
hasFile = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const removeFile = () => {
|
||||||
|
fileForm.reset();
|
||||||
|
hasFile = false;
|
||||||
|
};
|
||||||
|
|
||||||
|
const submitUrl = async (url: string) => {
|
||||||
|
const formData = new FormData();
|
||||||
|
formData.append('url', url);
|
||||||
|
return submit(formData);
|
||||||
|
};
|
||||||
|
|
||||||
const submitFile = async (file: File) => {
|
const submitFile = async (file: File) => {
|
||||||
const formData = new FormData();
|
const formData = new FormData();
|
||||||
formData.append('file', files[0]);
|
formData.append('file', file);
|
||||||
|
return submit(formData);
|
||||||
|
};
|
||||||
|
|
||||||
|
const submit = async (formData: FormData) => {
|
||||||
const response = await fetch('/', {
|
const response = await fetch('/', {
|
||||||
method: 'POST',
|
method: 'POST',
|
||||||
body: formData
|
body: formData
|
||||||
|
@ -24,12 +44,16 @@
|
||||||
|
|
||||||
const summarize = async () => {
|
const summarize = async () => {
|
||||||
try {
|
try {
|
||||||
if (files && files.length >= 1) {
|
let result;
|
||||||
const result = await submitFile(files[0]);
|
if (hasFile) {
|
||||||
console.log(result);
|
if (!files) throw new Error('File missing');
|
||||||
summary.set(JSON.stringify(result.response));
|
if (files.length <= 0) throw new Error('File missing');
|
||||||
goto('chat');
|
result = await submitFile(files[0]);
|
||||||
|
} else {
|
||||||
|
result = await submitUrl(url);
|
||||||
}
|
}
|
||||||
|
summary.set(JSON.stringify(result.response));
|
||||||
|
return goto('chat');
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
alert((error as App.Error).message);
|
alert((error as App.Error).message);
|
||||||
console.error(`Error: ${JSON.stringify(error)}`);
|
console.error(`Error: ${JSON.stringify(error)}`);
|
||||||
|
@ -40,13 +64,40 @@
|
||||||
<div class="container h-full mx-auto flex justify-center items-center px-4">
|
<div class="container h-full mx-auto flex justify-center items-center px-4">
|
||||||
<div class="text-center">
|
<div class="text-center">
|
||||||
<h2>Please Sumi</h2>
|
<h2>Please Sumi</h2>
|
||||||
<p class="my-4">Select the text file you'd like to have summarized.</p>
|
<p class="my-4">Enter the URL you'd like to have summarized.</p>
|
||||||
<input type="file" accept=".txt" name="conversation" bind:files />
|
<div>
|
||||||
|
<input
|
||||||
|
class="input"
|
||||||
|
type="text"
|
||||||
|
bind:value={url}
|
||||||
|
bind:this={urlInput}
|
||||||
|
disabled={hasFile}
|
||||||
|
placeholder="https://stupid.article/you-will-never-believe-this-shit"
|
||||||
|
/>
|
||||||
|
</div>
|
||||||
|
<p class="my-6">•</p>
|
||||||
|
<div>
|
||||||
|
<p>Or you can submit a text file instead:</p>
|
||||||
|
<form bind:this={fileForm}>
|
||||||
|
<div class="flex flex-row">
|
||||||
|
<input class="input my-4" type="file" accept=".txt" name="conversation" bind:files />
|
||||||
|
{#if hasFile}
|
||||||
|
<button
|
||||||
|
type="button"
|
||||||
|
class="btn btn-icon-sm variant-ghost-error self-center"
|
||||||
|
style="height: 100%;"
|
||||||
|
on:click={removeFile}
|
||||||
|
><Fa icon={faTrashCan} />
|
||||||
|
</button>
|
||||||
|
{/if}
|
||||||
|
</div>
|
||||||
|
</form>
|
||||||
|
</div>
|
||||||
<div class="flex justify-center space-x-2 my-12">
|
<div class="flex justify-center space-x-2 my-12">
|
||||||
<button
|
<button
|
||||||
type="button"
|
type="button"
|
||||||
class="btn variant-ringed-tertiary"
|
class="btn variant-ringed-tertiary"
|
||||||
disabled={isDisabled}
|
disabled={!hasFile && url.length <= 0}
|
||||||
on:click={summarize}>Summarize</button
|
on:click={summarize}>Summarize</button
|
||||||
>
|
>
|
||||||
</div>
|
</div>
|
||||||
|
|
|
@ -3,14 +3,41 @@ import type { RequestHandler } from './$types';
|
||||||
import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
|
import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
|
||||||
import { OpenAI } from 'langchain/llms';
|
import { OpenAI } from 'langchain/llms';
|
||||||
import { loadSummarizationChain } from 'langchain/chains';
|
import { loadSummarizationChain } from 'langchain/chains';
|
||||||
|
import { Readability } from '@mozilla/readability';
|
||||||
|
|
||||||
|
const parseSite = async (url: string) => {
|
||||||
|
const jsdom = await import('jsdom');
|
||||||
|
const { JSDOM } = jsdom;
|
||||||
|
const response = await fetch(url, {
|
||||||
|
method: 'GET',
|
||||||
|
mode: 'cors',
|
||||||
|
headers: {
|
||||||
|
'Access-Control-Allow-Origin': '*'
|
||||||
|
}
|
||||||
|
});
|
||||||
|
const html = await response.text();
|
||||||
|
const dom = new JSDOM(html);
|
||||||
|
const article = new Readability(dom.window.document).parse();
|
||||||
|
return article?.textContent;
|
||||||
|
};
|
||||||
|
|
||||||
export const POST = (async ({ request, url }) => {
|
export const POST = (async ({ request, url }) => {
|
||||||
try {
|
try {
|
||||||
const form = await request.formData();
|
const form = await request.formData();
|
||||||
|
|
||||||
|
const url = form.get('url');
|
||||||
const file = form.get('file');
|
const file = form.get('file');
|
||||||
if (!file) throw new Error('No prompt found in the request.');
|
if (!file && !url) throw new Error('No prompt found in the request.');
|
||||||
const text = await (file as Blob).text();
|
let text: string;
|
||||||
|
if (url) {
|
||||||
|
const article = await parseSite(url as string);
|
||||||
|
if (!article) throw new Error('Could not parse site');
|
||||||
|
text = article;
|
||||||
|
} else if (file) {
|
||||||
|
text = await (file as Blob).text();
|
||||||
|
} else {
|
||||||
|
throw new Error('Unknown error has occurred.');
|
||||||
|
}
|
||||||
|
|
||||||
const model = new OpenAI({ temperature: 0 });
|
const model = new OpenAI({ temperature: 0 });
|
||||||
const textSplitter = new RecursiveCharacterTextSplitter({ chunkSize: 1000 });
|
const textSplitter = new RecursiveCharacterTextSplitter({ chunkSize: 1000 });
|
||||||
|
|
|
@ -7,17 +7,17 @@
|
||||||
require that you split the text and ask for ChatGPT to wait until the last prompt with the text
|
require that you split the text and ask for ChatGPT to wait until the last prompt with the text
|
||||||
before generating a summary.
|
before generating a summary.
|
||||||
</p>
|
</p>
|
||||||
<p class="font-sans my-4">
|
<p class="font-sans">
|
||||||
ChatGPT's web interface also can't be adjusted to lower the probability of the AI spouting
|
ChatGPT's web interface also can't be adjusted to lower the probability of the AI spouting
|
||||||
unrelated nonsense. The technical term is "temperature", and the higher the temperature, the
|
unrelated nonsense. The technical term is "temperature", and the higher the temperature, the
|
||||||
more randomness in the output.
|
more randomness in the output.
|
||||||
</p>
|
</p>
|
||||||
<p class="font-sans">
|
<p class="font-sans my-4">
|
||||||
With Sumi, you can simply enter a URL or upload the entire document as a text file, and it will
|
With Sumi, you can simply enter a URL or upload the entire document as a text file, and it will
|
||||||
handle everything else.
|
handle everything else.
|
||||||
</p>
|
</p>
|
||||||
<a
|
<a
|
||||||
class="btn variant-ringed-tertiary my-12"
|
class="btn variant-ringed-primary my-12"
|
||||||
href="https://git.silentsilas.com/silentsilas/Sumi"
|
href="https://git.silentsilas.com/silentsilas/Sumi"
|
||||||
target="_blank"
|
target="_blank"
|
||||||
rel="noreferrer"
|
rel="noreferrer"
|
||||||
|
|
|
@ -1,4 +1,5 @@
|
||||||
<script lang="ts">
|
<script lang="ts">
|
||||||
|
import { goto } from '$app/navigation';
|
||||||
import prompt from '$lib/shared/stores/summary';
|
import prompt from '$lib/shared/stores/summary';
|
||||||
let currentSummary: string;
|
let currentSummary: string;
|
||||||
|
|
||||||
|
@ -13,4 +14,5 @@
|
||||||
>
|
>
|
||||||
<h2 class="my-4">Summary</h2>
|
<h2 class="my-4">Summary</h2>
|
||||||
<div>{currentSummary}</div>
|
<div>{currentSummary}</div>
|
||||||
|
<button class="btn variant-ringed-primary my-12" on:click={() => goto('/')}>Go Back</button>
|
||||||
</div>
|
</div>
|
||||||
|
|
Loading…
Reference in New Issue