From 6efd069b5d918f847ec33426c51f959ed93f312a Mon Sep 17 00:00:00 2001
From: Raven Scott
Date: Mon, 14 Aug 2023 23:00:23 -0400
Subject: [PATCH] counting tokens properly

---
 default.env        | 53 ------------------------------------------
 default.gpu.env    | 58 ----------------------------------------------
 docker-compose.yml |  2 +-
 llamabot.js        | 36 +++++++++++++++++-----------
 package-lock.json  | 22 +++++++++++++++++-
 package.json       |  4 +++-
 server/Dockerfile  |  2 +-
 7 files changed, 49 insertions(+), 128 deletions(-)
 delete mode 100644 default.env
 delete mode 100644 default.gpu.env

diff --git a/default.env b/default.env
deleted file mode 100644
index 09ea689..0000000
--- a/default.env
+++ /dev/null
@@ -1,53 +0,0 @@
-# Discord Token
-THE_TOKEN = "DISCORD_TOKEN_HERE"
-
-# The Channel IDs the bot will operate in seperated by commas
-CHANNEL_IDS = 1094494101631680653,1094628334727614605
-
-# The INIT prompt for all conversations.
-INIT_PROMPT = "Assistant name: ChatAI. You code, write and provide any information without any mistakes."
-
-# Loading Emebed Refresh Timing
-REFRESH_INTERVAL=10
-
-# When a message is too large for discord we chunk the response into seperate messages.
-# To ensure we do not rate limit the bot we send these at a delay interval.
-# DEFAULT: 3 a good setting is between 3 and 7 seconds.
-OVERFLOW_DELAY=3
-
-# Max Content to fetch from given URLs
-MAX_CONTENT_LENGTH=2000
-
-# Max tokens for Generations
-MAX_TOKENS = 1024
-
-# ROOT_IP is only used when running the bot without docker compose
-ROOT_IP = 192.168.0.15
-
-# PORT is only used when running the bot without docker compose
-ROOT_PORT = 8000
-
-# Directory to your models (llama.cpp specfic settings)
-DATA_DIR = /home/USERNAME/weights
-
-# Enable Expirmental Message Caches (Limited to single session)
-# Cache will use ~1.4 GB or MORE of RAM. ONLY ENABLE IF YOUR SYSTEM CAN HANDLE THIS.
-CACHE = 1
-
-CACHE_TYPE = "disk"
-
-# Set number of threads to use, currently, a standard thread will utilize 1 whole core
-# I usually will set this between all cores I physcally have OR 2 cores less to allow for other processes.
-N_THREADS = 4
-
-# Always use MMAP unless you know what you are doing
-USE_MMAP=1
-
-# Only use MLOCK if you know what it does!
-USE_MLOCK=0
-
-# The higher the number the more hard core.
-REPEAT_PENALTY=1
-
-# Do not change
-GPU=0
\ No newline at end of file
diff --git a/default.gpu.env b/default.gpu.env
deleted file mode 100644
index 781aa31..0000000
--- a/default.gpu.env
+++ /dev/null
@@ -1,58 +0,0 @@
-# Discord Token
-THE_TOKEN = "DISCORD_TOKEN_HERE"
-
-# The Channel IDs the bot will operate in seperated by commas
-CHANNEL_IDS = 1094494101631680653,1094628334727614605
-
-# The INIT prompt for all conversations.
-INIT_PROMPT = "Assistant name: ChatAI. You code, write and provide any information without any mistakes."
-
-# Loading Emebed Refresh Timing
-REFRESH_INTERVAL=10
-
-# When a message is too large for discord we chunk the response into seperate messages.
-# To ensure we do not rate limit the bot we send these at a delay interval.
-# DEFAULT: 3 a good setting is between 3 and 7 seconds. 
-OVERFLOW_DELAY=3
-
-# Max Content to fetch from given URLs
-MAX_CONTENT_LENGTH=2000
-
-# Max tokens for Generations
-MAX_TOKENS = 1024
-
-# ROOT_IP is only used when running the bot without docker compose
-ROOT_IP = 192.168.0.15
-
-# PORT is only used when running the bot without docker compose
-ROOT_PORT = 8000
-
-# Directory to your models (llama.cpp specfic settings)
-DATA_DIR = /home/USERNAME/weights
-
-# Enable Expirmental Message Caches (Limited to single session)
-# Cache will use ~1.4 GB or MORE of RAM. ONLY ENABLE IF YOUR SYSTEM CAN HANDLE THIS.
-CACHE = 1
-
-CACHE_TYPE = "disk"
-
-# Set number of threads to use, currently, a standard thread will utilize 1 whole core
-# I usually will set this between all cores I physcally have OR 2 cores less to allow for other processes.
-N_THREADS = 4
-
-# Always use MMAP unless you know what you are doing
-USE_MMAP=1
-
-# Only use MLOCK if you know what it does!
-USE_MLOCK=0
-
-# The higher the number the more hard core.
-REPEAT_PENALTY=1
-
-# GPU SPECIFIC SETTINGS BELOW
-
-GPU=1
-
-N_GPU_LAYERS=32
-
-PYTHONUNBUFFERED=1
diff --git a/docker-compose.yml b/docker-compose.yml
index 4c0815b..6499f3f 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -11,7 +11,7 @@ services:
       - ${DATA_DIR}:/usr/src/app/models
     environment:
       - HOST=llama-python-server
-      - MODEL=./models/ggml-vic7b-q4_0.bin
+      - MODEL=./models/vicuna-7b-1.1.ggmlv3.q6_K.bin
   llama-python-djs-bot:
     container_name: llama-python-djs-bot
     restart: unless-stopped
diff --git a/llamabot.js b/llamabot.js
index 533b496..a67550a 100644
--- a/llamabot.js
+++ b/llamabot.js
@@ -435,13 +435,22 @@ async function generateResponse(conversation, message) {
   // Grab the REFRESH_INTERVAL from ENV if not exist, lets use 7 (seconds)
   const refreshInterval = setInterval(showSystemLoad, (process.env.REFRESH_INTERVAL || 7) * 1000);
 
-  // Handle context size
-  // Encode the messages
-  const encodedTokens = llamaTokenizer.encode(messagesCopy);
-
-  // Check the token length
-  const tokenLength = encodedTokens.length;
-  console.log(`CTX SIZE: ${tokenLength}`);
+  function countLlamaTokens(messages) {
+    let totalTokens = 0;
+
+    for (const message of messages) {
+      if (message.role === 'user') {
+        const encodedTokens = llamaTokenizer.encode(message.content);
+        totalTokens += encodedTokens.length;
+      }
+    }
+
+    return totalTokens;
+  }
+
+  let totalTokens = countLlamaTokens(messagesCopy);
+  console.log(`Total Llama tokens: ${totalTokens}`);
+  let tokenLength = totalTokens
 
   // Remove older conversations if necessary
   const maxLength = 2048;
@@ -467,11 +476,9 @@ async function generateResponse(conversation, message) {
       }
     }
 
-      // Check the updated token length
+    // Check the updated token length
   }
 
-  console.log(`CTX SIZE AFTER PROCESSING: ${llamaTokenizer.encode(messagesCopy).length}`);
-
   // Sending request to our API
   const response = await fetch(`http://${process.env.ROOT_IP}:${process.env.ROOT_PORT}/v1/chat/completions`, {
     method: 'POST',
@@ -496,9 +503,12 @@ async function generateResponse(conversation, message) {
     // clear the interval, replace the "please wait" message with the response, and update the message
     console.log(responseText);
 
-    await botMessage.delete()
-    clearInterval(refreshInterval);
-    botMessage = null;
+    if (time > 2) {
+      await botMessage.delete()
+      clearInterval(refreshInterval);
+      botMessage = null;
+    }
+
 
     return responseText;
   } catch (err) {
diff --git a/package-lock.json b/package-lock.json
index 38e3d39..a5d9cf5 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -13,10 +13,12 @@
         "cpu-stat": "^2.0.1",
         "discord.js": "^14.9.0",
         "dotenv": "^16.0.3",
+        "gpt-tokenizer": "^2.1.1",
         "llama-tokenizer-js": "^1.0.0",
         "node-fetch": "^3.3.1",
         "node-nvidia-smi": "^1.0.0",
-        "os": "^0.1.2"
+        "os": "^0.1.2",
+        "tiktoken": "^1.0.10"
       }
     },
     "node_modules/@discordjs/builders": {
@@ -405,6 +407,14 @@
         "node": ">=12.20.0"
       }
     },
+    "node_modules/gpt-tokenizer": {
+      "version": "2.1.1",
+      "resolved": "https://registry.npmjs.org/gpt-tokenizer/-/gpt-tokenizer-2.1.1.tgz",
+      "integrity": "sha512-WlX+vj6aPaZ71U6Bf18fem+5k58zlgh2a4nbc7KHy6aGVIyq3nCh709b/8momu34sV/5t/SpzWi8LayWD9uyDw==",
+      "dependencies": {
+        "rfc4648": "^1.5.2"
+      }
+    },
     "node_modules/htmlparser2": {
       "version": "8.0.2",
       "resolved": "https://registry.npmjs.org/htmlparser2/-/htmlparser2-8.0.2.tgz",
@@ -587,6 +597,11 @@
         "url": "https://github.com/sponsors/Borewit"
       }
     },
+    "node_modules/rfc4648": {
+      "version": "1.5.2",
+      "resolved": "https://registry.npmjs.org/rfc4648/-/rfc4648-1.5.2.tgz",
+      "integrity": "sha512-tLOizhR6YGovrEBLatX1sdcuhoSCXddw3mqNVAcKxGJ+J0hFeJ+SjeWCv5UPA/WU3YzWPPuCVYgXBKZUPGpKtg=="
+    },
     "node_modules/safe-buffer": {
       "version": "5.2.1",
       "resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.2.1.tgz",
@@ -643,6 +658,11 @@
         "url": "https://github.com/sponsors/Borewit"
       }
     },
+    "node_modules/tiktoken": {
+      "version": "1.0.10",
+      "resolved": "https://registry.npmjs.org/tiktoken/-/tiktoken-1.0.10.tgz",
+      "integrity": "sha512-gF8ndTCNu7WcRFbl1UUWaFIB4CTXmHzS3tRYdyUYF7x3C6YR6Evoao4zhKDmWIwv2PzNbzoQMV8Pxt+17lEDbA=="
+    },
     "node_modules/token-types": {
       "version": "5.0.1",
       "resolved": "https://registry.npmjs.org/token-types/-/token-types-5.0.1.tgz",
diff --git a/package.json b/package.json
index 5701e15..d6cd487 100644
--- a/package.json
+++ b/package.json
@@ -14,9 +14,11 @@
     "cpu-stat": "^2.0.1",
     "discord.js": "^14.9.0",
     "dotenv": "^16.0.3",
+    "gpt-tokenizer": "^2.1.1",
    "llama-tokenizer-js": "^1.0.0",
     "node-fetch": "^3.3.1",
     "node-nvidia-smi": "^1.0.0",
-    "os": "^0.1.2"
+    "os": "^0.1.2",
+    "tiktoken": "^1.0.10"
   }
 }
diff --git a/server/Dockerfile b/server/Dockerfile
index 2720ea2..c63dce4 100644
--- a/server/Dockerfile
+++ b/server/Dockerfile
@@ -6,6 +6,6 @@ RUN apt-get update; \
 
 WORKDIR /usr/src/app
 
-RUN pip3 install llama-cpp-python[server]
+RUN CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 pip install llama-cpp-python[server]
 
 CMD python3 -m llama_cpp.server
\ No newline at end of file