Fix token reduction when sessions grow too large

Raven Scott 2023-08-15 01:08:56 -04:00
parent f91d66b2b3
commit 05e79cba3a
2 changed files with 64 additions and 3 deletions

default.env Normal file

@ -0,0 +1,58 @@
# Discord Token
THE_TOKEN = ""
# The Channel IDs the bot will operate in, separated by commas
CHANNEL_IDS =
# The INIT prompt for all conversations.
INIT_PROMPT = "Assistant name: ChatAI. You code, write and provide any information without any mistakes. You can read basic text from URLs a user sends, via scraping."
# Loading Embed Refresh Timing
REFRESH_INTERVAL=2
# When a message is too large for Discord we chunk the response into separate messages.
# To ensure we do not rate limit the bot we send these at a delay interval.
# DEFAULT: 3. A good setting is between 3 and 7 seconds.
OVERFLOW_DELAY=3
# Max Content to fetch from given URLs
MAX_CONTENT_LENGTH=2000
# Max tokens for Generations
MAX_TOKENS = 1499
# ROOT_IP is only used when running the bot without docker compose
ROOT_IP = 127.0.0.1
# PORT is only used when running the bot without docker compose
ROOT_PORT = 8000
# Directory to your models (llama.cpp specific settings)
DATA_DIR = /Users/username/code/models
# Enable Experimental Message Caches (Limited to single session)
# Cache will use ~1.4 GB or MORE of RAM. ONLY ENABLE IF YOUR SYSTEM CAN HANDLE THIS.
CACHE = 1
CACHE_TYPE = "disk"
# Set number of threads to use; currently, a standard thread will utilize 1 whole core.
# I usually set this to all the cores I physically have, OR 2 cores fewer to leave room for other processes.
N_THREADS = 10
# Always use MMAP unless you know what you are doing
#USE_MMAP=1
# Only use MLOCK if you know what it does!
USE_MLOCK=0
# The higher the number, the stronger the penalty against repeated text.
REPEAT_PENALTY=1
# GPU SPECIFIC SETTINGS BELOW
GPU=0
N_GPU_LAYERS=32
PYTHONUNBUFFERED=1
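
For reference, here is a minimal sketch of how these settings might be read on the Node side, assuming the dotenv package. The variable names come from the file above; the loader itself is an assumption, not the bot's actual startup code.

// Hedged sketch: load default.env and coerce the numeric settings.
// Illustrative only; this commit does not include this loader.
require('dotenv').config({ path: 'default.env' });

const config = {
  token: process.env.THE_TOKEN,
  channelIds: (process.env.CHANNEL_IDS || '').split(',').filter(Boolean),
  initPrompt: process.env.INIT_PROMPT,
  maxTokens: parseInt(process.env.MAX_TOKENS, 10) || 1499,
  nThreads: parseInt(process.env.N_THREADS, 10) || 10,
  overflowDelayMs: (parseInt(process.env.OVERFLOW_DELAY, 10) || 3) * 1000,
};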


@ -453,7 +453,7 @@ async function generateResponse(conversation, message) {
let tokenLength = totalTokens
// Remove older conversations if necessary
- const maxLength = 2000;
+ const maxLength = 1800;
if (tokenLength > maxLength) {
const diff = tokenLength - maxLength;
let removedTokens = 0;
@ -461,15 +461,17 @@ async function generateResponse(conversation, message) {
// Iterate over the messages in reverse order
for (let i = messagesCopy.length - 1; i >= 0; i--) {
const message = messagesCopy[i];
- const messageTokens = llamaTokenizer.encode([message]);
+ const messageTokens = countLlamaTokens(message);
// Calculate the token length of the current message
- const messageTokenLength = messageTokens.length;
+ const messageTokenLength = messageTokens; // countLlamaTokens already returns a count
// Remove the current message if it won't make the token length negative
if (removedTokens + messageTokenLength <= diff) {
messagesCopy.splice(i, 1);
removedTokens += messageTokenLength;
+ console.log(removedTokens + " removed\nAfter Resize: " + countLlamaTokens(messagesCopy));
} else {
// Break the loop if removing the message would make the token length negative
break;
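
The new code leans on a countLlamaTokens helper. Below is a minimal sketch of what such a helper might look like, assuming llama-tokenizer-js (the package behind the removed llamaTokenizer.encode call); the body here is an assumption, not the repository's actual implementation.

import llamaTokenizer from 'llama-tokenizer-js';

// Hedged sketch: return the total LLaMA token count for a message or
// an array of messages, where each message is a string or has .content.
function countLlamaTokens(messages) {
  const list = Array.isArray(messages) ? messages : [messages];
  return list.reduce((total, msg) => {
    const text = typeof msg === 'string' ? msg : (msg && msg.content) || '';
    return total + llamaTokenizer.encode(text).length;
  }, 0);
}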
@ -506,6 +508,7 @@ async function generateResponse(conversation, message) {
if (time > 2) {
await botMessage.delete()
clearInterval(refreshInterval);
+ clearTimeout(timeout);
botMessage = null;
}
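
The added clearTimeout matters because the placeholder message is driven by two timers: an interval that refreshes the loading embed and a timeout that gives up on the generation. Once the message is deleted, both must be cancelled, or the remaining timer would later fire against a null botMessage. A sketch of the pattern follows; every name other than botMessage, refreshInterval and timeout is an assumption.

// Hedged sketch of the two-timer pattern implied by this hunk; meant to
// run inside an async Discord.js message handler.
const refreshSeconds = 2;   // e.g. REFRESH_INTERVAL from default.env
const deadlineSeconds = 60; // assumed deadline, not from this commit

let botMessage = await message.channel.send('Generating...');

const refreshInterval = setInterval(() => {
  // periodically refresh the loading embed here
}, refreshSeconds * 1000);

const timeout = setTimeout(async () => {
  // deadline reached: tear down the placeholder and the other timer
  await botMessage.delete();
  clearInterval(refreshInterval);
  botMessage = null;
}, deadlineSeconds * 1000);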