fixing up token reduction when sessions are too large
parent f91d66b2b3
commit 05e79cba3a
default.env (new file, 58 lines)
@@ -0,0 +1,58 @@
# Discord Token
THE_TOKEN = ""

# The Channel IDs the bot will operate in, separated by commas
CHANNEL_IDS =

# The INIT prompt for all conversations.
INIT_PROMPT = "Assistant name: ChatAI. You code, write and provide any information without any mistakes. You can read basic text from URLs via scraping if a user sends one."

# Loading Embed Refresh Timing
REFRESH_INTERVAL=2

# When a message is too large for Discord we chunk the response into separate messages.
# To ensure we do not rate limit the bot we send these at a delay interval.
# DEFAULT: 3; a good setting is between 3 and 7 seconds.
OVERFLOW_DELAY=3

# Max content to fetch from given URLs
MAX_CONTENT_LENGTH=2000

# Max tokens for generations
MAX_TOKENS = 1499

# ROOT_IP is only used when running the bot without docker compose
ROOT_IP = 127.0.0.1

# ROOT_PORT is only used when running the bot without docker compose
ROOT_PORT = 8000

# Directory to your models (llama.cpp specific setting)
DATA_DIR = /Users/username/code/models

# Enable experimental message caches (limited to a single session)
# The cache will use ~1.4 GB or MORE of RAM. ONLY ENABLE IF YOUR SYSTEM CAN HANDLE THIS.
CACHE = 1

CACHE_TYPE = "disk"

# Set the number of threads to use; currently a standard thread will utilize one whole core.
# I usually set this to all the physical cores I have, or two fewer to leave room for other processes.
N_THREADS = 10

# Always use MMAP unless you know what you are doing
#USE_MMAP=1

# Only use MLOCK if you know what it does!
USE_MLOCK=0

# The higher the number, the more strongly repetition is penalized.
REPEAT_PENALTY=1

# GPU SPECIFIC SETTINGS BELOW

GPU=0

N_GPU_LAYERS=32

PYTHONUNBUFFERED=1
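The comments above describe the chunking behavior: when a reply exceeds Discord's per-message limit, the bot splits it and sends the pieces OVERFLOW_DELAY seconds apart to avoid rate limiting. A minimal sketch of how a Node.js bot might consume these settings (the dotenv loader, the sendChunked helper, and the channel argument are assumptions for illustration, not the project's actual code):

// Sketch only: assumed dotenv-based loading of default.env
require('dotenv').config({ path: 'default.env' });

const OVERFLOW_DELAY = Number(process.env.OVERFLOW_DELAY || 3); // seconds between chunks
const DISCORD_LIMIT = 2000;                                     // Discord's per-message character cap

// Hypothetical helper: split an oversized reply and send each piece with a delay
// so the bot stays under Discord's rate limits.
async function sendChunked(channel, text) {
  for (let i = 0; i < text.length; i += DISCORD_LIMIT) {
    await channel.send(text.slice(i, i + DISCORD_LIMIT));
    if (i + DISCORD_LIMIT < text.length) {
      await new Promise((resolve) => setTimeout(resolve, OVERFLOW_DELAY * 1000));
    }
  }
}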
@@ -453,7 +453,7 @@ async function generateResponse(conversation, message) {
     let tokenLength = totalTokens

     // Remove older conversations if necessary
-    const maxLength = 2000;
+    const maxLength = 1800;
     if (tokenLength > maxLength) {
         const diff = tokenLength - maxLength;
         let removedTokens = 0;
@@ -461,15 +461,17 @@ async function generateResponse(conversation, message) {
         // Iterate over the messages in reverse order
         for (let i = messagesCopy.length - 1; i >= 0; i--) {
             const message = messagesCopy[i];
-            const messageTokens = llamaTokenizer.encode([message]);
+            const messageTokens = countLlamaTokens(message);

             // Calculate the token length of the current message
-            const messageTokenLength = messageTokens.length;
+            const messageTokenLength = countLlamaTokens(messageTokens);

             // Remove the current message if it won't make the token length negative
             if (removedTokens + messageTokenLength <= diff) {
                 messagesCopy.splice(i, 1);
                 removedTokens += messageTokenLength;
+                console.log(removedTokens + " removed \nAfter Resize: " + countLlamaTokens(messagesCopy)
+                )
             } else {
                 // Break the loop if removing the message would make the token length negative
                 break;
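The hunk above swaps the raw llamaTokenizer.encode call for countLlamaTokens and trims messages from the end of messagesCopy until the overshoot (diff) is covered. A self-contained sketch of that trimming pattern, mirroring the loop as written (countLlamaTokens is assumed to return an integer token count for a message or an array of messages, and trimToBudget is a hypothetical name, not a function in the repo):

// Hypothetical standalone version of the trimming loop in the hunk above.
function trimToBudget(messages, countLlamaTokens, maxLength = 1800) {
  const messagesCopy = [...messages];
  const diff = countLlamaTokens(messagesCopy) - maxLength;
  if (diff <= 0) return messagesCopy;        // already within budget

  let removedTokens = 0;
  // Walk from the newest message backwards, dropping messages until
  // enough tokens have been removed to cover the overshoot.
  for (let i = messagesCopy.length - 1; i >= 0; i--) {
    const messageTokenLength = countLlamaTokens(messagesCopy[i]);
    if (removedTokens + messageTokenLength > diff) break;
    messagesCopy.splice(i, 1);
    removedTokens += messageTokenLength;
  }
  return messagesCopy;
}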
@@ -506,6 +508,7 @@ async function generateResponse(conversation, message) {
             if (time > 2) {
                 await botMessage.delete()
                 clearInterval(refreshInterval);
+                clearTimeout(timeout);
                 botMessage = null;
             }

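For context, refreshInterval here corresponds to the REFRESH_INTERVAL loading-embed timer from default.env, and the added clearTimeout ensures the companion timer cannot fire after the placeholder message is deleted. A rough sketch of the assumed wiring (only refreshInterval, timeout, botMessage, and time appear in the hunk itself; the rest is illustrative):

// Assumed setup for the loading placeholder that the hunk above tears down.
let botMessage = await message.channel.send('Loading...');        // placeholder the user sees
const refreshInterval = setInterval(() => {
  // periodically edit the placeholder so it still looks alive
  botMessage?.edit('Loading...');
}, Number(process.env.REFRESH_INTERVAL || 2) * 1000);
const timeout = setTimeout(() => clearInterval(refreshInterval), 5 * 60 * 1000); // safety stop

Clearing both timers when the placeholder is torn down keeps either callback from firing against a message that has already been deleted and nulled out.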