cleaner output from web scrapes
This commit is contained in:
parent
f211fe2d67
commit
9489068b8e
14
llamabot.js
14
llamabot.js
@ -214,16 +214,20 @@ async function generateResponse(conversation, message) {
|
|||||||
}
|
}
|
||||||
if (pageContent) {
|
if (pageContent) {
|
||||||
const MAX_CONTENT_LENGTH = process.env.MAX_CONTENT_LENGTH;
|
const MAX_CONTENT_LENGTH = process.env.MAX_CONTENT_LENGTH;
|
||||||
let plainTextContent = $('<div>').html(pageContent).text().trim().replace(/[\r\n\t]+/g, ' ');
|
let plainTextContent = $('<div>').html(pageContent).text().trim().replace(/[\r\n]+/g, ' ');
|
||||||
|
const codePattern = /\/\/|\/\*|\*\/|\{|\}|\[|\]|\bfunction\b|\bclass\b|\b0x[0-9A-Fa-f]+\b|\b0b[01]+\b/;
|
||||||
|
const isCode = codePattern.test(plainTextContent);
|
||||||
|
|
||||||
|
if (isCode) {
|
||||||
|
plainTextContent = plainTextContent.replace(codePattern, '');
|
||||||
|
}
|
||||||
|
// Remove anything enclosed in parentheses, along with the spaces around it
|
||||||
|
plainTextContent = plainTextContent.replace(/ *\([^)]*\) */g, '');
|
||||||
if (plainTextContent.length > MAX_CONTENT_LENGTH) {
|
if (plainTextContent.length > MAX_CONTENT_LENGTH) {
|
||||||
plainTextContent = plainTextContent.substring(0, MAX_CONTENT_LENGTH) + '...';
|
plainTextContent = plainTextContent.substring(0, MAX_CONTENT_LENGTH) + '...';
|
||||||
response += `Content: ${plainTextContent.trim()}`;
|
|
||||||
} else {
|
|
||||||
response += `Content: ${plainTextContent.trim()}`;
|
|
||||||
}
|
}
|
||||||
|
response += `Content: ${plainTextContent.trim().replace(/[\n\r]+| +/g, ' ')}`;
|
||||||
}
|
}
|
||||||
|
|
||||||
response += `URL: ${url}`;
|
response += `URL: ${url}`;
|
||||||
|
|
||||||
// Append bot message to conversation history
|
// Append bot message to conversation history
|
||||||
|
Loading…
Reference in New Issue
Block a user