fixing up token reduction when sessions are too large
parent f91d66b2b3
commit 05e79cba3a
default.env (new file, 58 lines)
@@ -0,0 +1,58 @@
# Discord Token
THE_TOKEN = ""

# The Channel IDs the bot will operate in, separated by commas
CHANNEL_IDS =

# The INIT prompt for all conversations.
INIT_PROMPT = "Assistant name: ChatAI. You code, write and provide any information without any mistakes. You can read basic text from URLs via scraping if a user sends one."

# Loading Embed Refresh Timing
REFRESH_INTERVAL=2

# When a message is too large for Discord we chunk the response into separate messages.
# To ensure we do not rate limit the bot we send these at a delay interval.
# DEFAULT: 3; a good setting is between 3 and 7 seconds.
OVERFLOW_DELAY=3

# Max content to fetch from given URLs
MAX_CONTENT_LENGTH=2000

# Max tokens for generations
MAX_TOKENS = 1499

# ROOT_IP is only used when running the bot without docker compose
ROOT_IP = 127.0.0.1

# ROOT_PORT is only used when running the bot without docker compose
ROOT_PORT = 8000

# Directory to your models (llama.cpp specific setting)
DATA_DIR = /Users/username/code/models

# Enable experimental message caches (limited to a single session)
# The cache will use ~1.4 GB or MORE of RAM. ONLY ENABLE IF YOUR SYSTEM CAN HANDLE THIS.
CACHE = 1

CACHE_TYPE = "disk"

# Set the number of threads to use; currently a standard thread will utilize one whole core.
# I usually set this to all the physical cores I have, or two fewer to leave room for other processes.
N_THREADS = 10

# Always use MMAP unless you know what you are doing
#USE_MMAP=1

# Only use MLOCK if you know what it does!
USE_MLOCK=0

# The higher the number, the more strongly repetition is penalized.
REPEAT_PENALTY=1

# GPU SPECIFIC SETTINGS BELOW

GPU=0

N_GPU_LAYERS=32

PYTHONUNBUFFERED=1
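The comments above describe the chunking behavior: when a reply exceeds Discord's per-message limit, the bot splits it and sends the pieces OVERFLOW_DELAY seconds apart to avoid rate limiting. A minimal sketch of how a Node.js bot might consume these settings (the dotenv loader, the sendChunked helper, and the channel argument are assumptions for illustration, not the project's actual code):

// Sketch only: assumed dotenv-based loading of default.env
require('dotenv').config({ path: 'default.env' });

const OVERFLOW_DELAY = Number(process.env.OVERFLOW_DELAY || 3); // seconds between chunks
const DISCORD_LIMIT = 2000;                                     // Discord's per-message character cap

// Hypothetical helper: split an oversized reply and send each piece with a delay
// so the bot stays under Discord's rate limits.
async function sendChunked(channel, text) {
  for (let i = 0; i < text.length; i += DISCORD_LIMIT) {
    await channel.send(text.slice(i, i + DISCORD_LIMIT));
    if (i + DISCORD_LIMIT < text.length) {
      await new Promise((resolve) => setTimeout(resolve, OVERFLOW_DELAY * 1000));
    }
  }
}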
@@ -453,7 +453,7 @@ async function generateResponse(conversation, message) {
     let tokenLength = totalTokens

     // Remove older conversations if necessary
-    const maxLength = 2000;
+    const maxLength = 1800;
     if (tokenLength > maxLength) {
         const diff = tokenLength - maxLength;
         let removedTokens = 0;
@@ -461,15 +461,17 @@ async function generateResponse(conversation, message) {
         // Iterate over the messages in reverse order
         for (let i = messagesCopy.length - 1; i >= 0; i--) {
             const message = messagesCopy[i];
-            const messageTokens = llamaTokenizer.encode([message]);
+            const messageTokens = countLlamaTokens(message);

             // Calculate the token length of the current message
-            const messageTokenLength = messageTokens.length;
+            const messageTokenLength = countLlamaTokens(messageTokens);

             // Remove the current message if it won't make the token length negative
             if (removedTokens + messageTokenLength <= diff) {
                 messagesCopy.splice(i, 1);
                 removedTokens += messageTokenLength;
+                console.log(removedTokens + " removed \nAfter Resize: " + countLlamaTokens(messagesCopy)
+                )
             } else {
                 // Break the loop if removing the message would make the token length negative
                 break;
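The hunk above swaps the raw llamaTokenizer.encode call for countLlamaTokens and trims messages from the end of messagesCopy until the overshoot (diff) is covered. A self-contained sketch of that trimming pattern, mirroring the loop as written (countLlamaTokens is assumed to return an integer token count for a message or an array of messages, and trimToBudget is a hypothetical name, not a function in the repo):

// Hypothetical standalone version of the trimming loop in the hunk above.
function trimToBudget(messages, countLlamaTokens, maxLength = 1800) {
  const messagesCopy = [...messages];
  const diff = countLlamaTokens(messagesCopy) - maxLength;
  if (diff <= 0) return messagesCopy;        // already within budget

  let removedTokens = 0;
  // Walk from the newest message backwards, dropping messages until
  // enough tokens have been removed to cover the overshoot.
  for (let i = messagesCopy.length - 1; i >= 0; i--) {
    const messageTokenLength = countLlamaTokens(messagesCopy[i]);
    if (removedTokens + messageTokenLength > diff) break;
    messagesCopy.splice(i, 1);
    removedTokens += messageTokenLength;
  }
  return messagesCopy;
}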
@@ -506,6 +508,7 @@ async function generateResponse(conversation, message) {
             if (time > 2) {
                 await botMessage.delete()
                 clearInterval(refreshInterval);
+                clearTimeout(timeout);
                 botMessage = null;
             }

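For context, refreshInterval here corresponds to the REFRESH_INTERVAL loading-embed timer from default.env, and the added clearTimeout ensures the companion timer cannot fire after the placeholder message is deleted. A rough sketch of the assumed wiring (only refreshInterval, timeout, botMessage, and time appear in the hunk itself; the rest is illustrative):

// Assumed setup for the loading placeholder that the hunk above tears down.
let botMessage = await message.channel.send('Loading...');        // placeholder the user sees
const refreshInterval = setInterval(() => {
  // periodically edit the placeholder so it still looks alive
  botMessage?.edit('Loading...');
}, Number(process.env.REFRESH_INTERVAL || 2) * 1000);
const timeout = setTimeout(() => clearInterval(refreshInterval), 5 * 60 * 1000); // safety stop

Clearing both timers when the placeholder is torn down keeps either callback from firing against a message that has already been deleted and nulled out.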