From 05e79cba3a8d4544d084e8759995bf4ed55dbe14 Mon Sep 17 00:00:00 2001
From: Raven Scott
Date: Tue, 15 Aug 2023 01:08:56 -0400
Subject: [PATCH] fixing up token reducing if sessions too large

---
 default.env | 58 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 llamabot.js |  9 ++++++---
 2 files changed, 64 insertions(+), 3 deletions(-)
 create mode 100644 default.env

diff --git a/default.env b/default.env
new file mode 100644
index 0000000..fbb50dd
--- /dev/null
+++ b/default.env
@@ -0,0 +1,58 @@
+# Discord Token
+THE_TOKEN = ""
+
+# The Channel IDs the bot will operate in, separated by commas
+CHANNEL_IDS =
+
+# The INIT prompt for all conversations.
+INIT_PROMPT = "Assistant name: ChatAI. You code, write and provide any information without any mistakes. You can read basic text from URLs if a user sends one, via scraping."
+
+# Loading Embed Refresh Timing
+REFRESH_INTERVAL=2
+
+# When a message is too large for Discord we chunk the response into separate messages.
+# To ensure we do not rate limit the bot we send these at a delay interval.
+# DEFAULT: 3. A good setting is between 3 and 7 seconds.
+OVERFLOW_DELAY=3
+
+# Max content to fetch from given URLs
+MAX_CONTENT_LENGTH=2000
+
+# Max tokens for generations
+MAX_TOKENS = 1499
+
+# ROOT_IP is only used when running the bot without docker compose
+ROOT_IP = 127.0.0.1
+
+# PORT is only used when running the bot without docker compose
+ROOT_PORT = 8000
+
+# Directory to your models (llama.cpp specific settings)
+DATA_DIR = /Users/username/code/models
+
+# Enable experimental message caches (limited to a single session).
+# Cache will use ~1.4 GB or MORE of RAM. ONLY ENABLE IF YOUR SYSTEM CAN HANDLE THIS.
+CACHE = 1
+
+CACHE_TYPE = "disk"
+
+# Set the number of threads to use; currently, a standard thread will utilize 1 whole core.
+# I usually set this to all the cores I physically have, OR 2 cores less to allow for other processes.
+N_THREADS = 10
+
+# Always use MMAP unless you know what you are doing
+#USE_MMAP=1
+
+# Only use MLOCK if you know what it does!
+USE_MLOCK=0
+
+# The higher the number, the stronger the repeat penalty.
+REPEAT_PENALTY=1
+
+# GPU SPECIFIC SETTINGS BELOW
+
+GPU=0
+
+N_GPU_LAYERS=32
+
+PYTHONUNBUFFERED=1

diff --git a/llamabot.js b/llamabot.js
index 3d03203..a9279a0 100644
--- a/llamabot.js
+++ b/llamabot.js
@@ -453,7 +453,7 @@ async function generateResponse(conversation, message) {
   let tokenLength = totalTokens
 
   // Remove older conversations if necessary
-  const maxLength = 2000;
+  const maxLength = 1800;
   if (tokenLength > maxLength) {
     const diff = tokenLength - maxLength;
     let removedTokens = 0;
@@ -461,15 +461,17 @@
     // Iterate over the messages in reverse order
     for (let i = messagesCopy.length - 1; i >= 0; i--) {
       const message = messagesCopy[i];
-      const messageTokens = llamaTokenizer.encode([message]);
+      const messageTokens = countLlamaTokens([message]);
 
       // Calculate the token length of the current message
-      const messageTokenLength = messageTokens.length;
+      const messageTokenLength = messageTokens;
 
       // Remove the current message if it won't make the token length negative
       if (removedTokens + messageTokenLength <= diff) {
         messagesCopy.splice(i, 1);
         removedTokens += messageTokenLength;
+        console.log(removedTokens + " removed\nAfter Resize: " +
+          countLlamaTokens(messagesCopy));
       } else {
         // Break the loop if removing the message would make the token length negative
         break;
       }
@@ -506,6 +508,7 @@ async function generateResponse(conversation, message) {
       if (time > 2) {
         await botMessage.delete()
         clearInterval(refreshInterval);
+        clearTimeout(timeout);
         botMessage = null;
       }
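
Note on the llamabot.js hunks: they replace the raw llamaTokenizer.encode call with a countLlamaTokens helper and splice messages out of a copy of the session until it fits under the new 1800-token budget. The sketch below is a minimal, self-contained illustration of that trimming pattern, not the bot's actual implementation; the countLlamaTokens body, the { role, content } message shape, and the llama-tokenizer-js import are assumptions inferred from how the diff uses them.

    // Assumed import; the real llamabot.js may load the tokenizer differently.
    import llamaTokenizer from 'llama-tokenizer-js';

    // Sum the tokenized length of every message in a conversation.
    function countLlamaTokens(messages) {
      let total = 0;
      for (const msg of messages) {
        if (typeof msg.content === 'string') {
          total += llamaTokenizer.encode(msg.content).length;
        }
      }
      return total;
    }

    // Splice messages out of a copy of the conversation, walking from the end
    // of the array as the diff does, until enough tokens have been removed.
    function trimToTokenBudget(messages, maxLength = 1800) {
      const copy = [...messages];
      const excess = countLlamaTokens(copy) - maxLength;
      if (excess <= 0) return copy;

      let removed = 0;
      for (let i = copy.length - 1; i >= 0; i--) {
        const messageTokens = countLlamaTokens([copy[i]]);
        if (removed + messageTokens <= excess) {
          copy.splice(i, 1);
          removed += messageTokens;
        } else {
          break;
        }
      }
      return copy;
    }

Trimming from the end of the array mirrors the loop in the diff; a variant that preserves the most recent exchanges would instead walk from index 0 and drop the oldest messages first.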