cleaner output from web scrapes

This commit is contained in:
Raven Scott 2023-05-05 18:31:24 +02:00
parent 57545a20ef
commit 18923b7909

View File

@@ -214,16 +214,20 @@ async function generateResponse(conversation, message) {
} }
if (pageContent) { if (pageContent) {
const MAX_CONTENT_LENGTH = process.env.MAX_CONTENT_LENGTH; const MAX_CONTENT_LENGTH = process.env.MAX_CONTENT_LENGTH;
let plainTextContent = $('<div>').html(pageContent).text().trim().replace(/[\r\n\t]+/g, ' '); let plainTextContent = $('<div>').html(pageContent).text().trim().replace(/[\r\n]+/g, ' ');
const codePattern = /\/\/|\/\*|\*\/|\{|\}|\[|\]|\bfunction\b|\bclass\b|\b0x[0-9A-Fa-f]+\b|\b0b[01]+\b/;
const isCode = codePattern.test(plainTextContent);
if (isCode) {
plainTextContent = plainTextContent.replace(codePattern, '');
}
// Remove anything enclosed in brackets
plainTextContent = plainTextContent.replace(/ *\([^)]*\) */g, '');
if (plainTextContent.length > MAX_CONTENT_LENGTH) { if (plainTextContent.length > MAX_CONTENT_LENGTH) {
plainTextContent = plainTextContent.substring(0, MAX_CONTENT_LENGTH) + '...'; plainTextContent = plainTextContent.substring(0, MAX_CONTENT_LENGTH) + '...';
response += `Content: ${plainTextContent.trim()}`;
} else {
response += `Content: ${plainTextContent.trim()}`;
} }
response += `Content: ${plainTextContent.trim().replace(/[\n\r]+| +/g, ' ')}`;
} }
response += `URL: ${url}`; response += `URL: ${url}`;
// Append bot message to conversation history // Append bot message to conversation history