From 6efd069b5d918f847ec33426c51f959ed93f312a Mon Sep 17 00:00:00 2001
From: Raven Scott
Date: Mon, 14 Aug 2023 23:00:23 -0400
Subject: [PATCH] counting tokens properly

---
 default.env        | 53 ------------------------------------------
 default.gpu.env    | 58 ----------------------------------------------
 docker-compose.yml |  2 +-
 llamabot.js        | 36 +++++++++++++++++-----------
 package-lock.json  | 22 +++++++++++++++++-
 package.json       |  4 +++-
 server/Dockerfile  |  2 +-
 7 files changed, 49 insertions(+), 128 deletions(-)
 delete mode 100644 default.env
 delete mode 100644 default.gpu.env

diff --git a/default.env b/default.env
deleted file mode 100644
index 09ea689..0000000
--- a/default.env
+++ /dev/null
@@ -1,53 +0,0 @@
-# Discord Token
-THE_TOKEN = "DISCORD_TOKEN_HERE"
-
-# The Channel IDs the bot will operate in seperated by commas
-CHANNEL_IDS = 1094494101631680653,1094628334727614605
-
-# The INIT prompt for all conversations.
-INIT_PROMPT = "Assistant name: ChatAI. You code, write and provide any information without any mistakes."
-
-# Loading Emebed Refresh Timing
-REFRESH_INTERVAL=10
-
-# When a message is too large for discord we chunk the response into seperate messages.
-# To ensure we do not rate limit the bot we send these at a delay interval.
-# DEFAULT: 3 a good setting is between 3 and 7 seconds.
-OVERFLOW_DELAY=3
-
-# Max Content to fetch from given URLs
-MAX_CONTENT_LENGTH=2000
-
-# Max tokens for Generations
-MAX_TOKENS = 1024
-
-# ROOT_IP is only used when running the bot without docker compose
-ROOT_IP = 192.168.0.15
-
-# PORT is only used when running the bot without docker compose
-ROOT_PORT = 8000
-
-# Directory to your models (llama.cpp specfic settings)
-DATA_DIR = /home/USERNAME/weights
-
-# Enable Expirmental Message Caches (Limited to single session)
-# Cache will use ~1.4 GB or MORE of RAM. ONLY ENABLE IF YOUR SYSTEM CAN HANDLE THIS.
-CACHE = 1
-
-CACHE_TYPE = "disk"
-
-# Set number of threads to use, currently, a standard thread will utilize 1 whole core
-# I usually will set this between all cores I physcally have OR 2 cores less to allow for other processes.
-N_THREADS = 4
-
-# Always use MMAP unless you know what you are doing
-USE_MMAP=1
-
-# Only use MLOCK if you know what it does!
-USE_MLOCK=0
-
-# The higher the number the more hard core.
-REPEAT_PENALTY=1
-
-# Do not change
-GPU=0
\ No newline at end of file
diff --git a/default.gpu.env b/default.gpu.env
deleted file mode 100644
index 781aa31..0000000
--- a/default.gpu.env
+++ /dev/null
@@ -1,58 +0,0 @@
-# Discord Token
-THE_TOKEN = "DISCORD_TOKEN_HERE"
-
-# The Channel IDs the bot will operate in seperated by commas
-CHANNEL_IDS = 1094494101631680653,1094628334727614605
-
-# The INIT prompt for all conversations.
-INIT_PROMPT = "Assistant name: ChatAI. You code, write and provide any information without any mistakes."
-
-# Loading Emebed Refresh Timing
-REFRESH_INTERVAL=10
-
-# When a message is too large for discord we chunk the response into seperate messages.
-# To ensure we do not rate limit the bot we send these at a delay interval.
-# DEFAULT: 3 a good setting is between 3 and 7 seconds. 
-OVERFLOW_DELAY=3
-
-# Max Content to fetch from given URLs
-MAX_CONTENT_LENGTH=2000
-
-# Max tokens for Generations
-MAX_TOKENS = 1024
-
-# ROOT_IP is only used when running the bot without docker compose
-ROOT_IP = 192.168.0.15
-
-# PORT is only used when running the bot without docker compose
-ROOT_PORT = 8000
-
-# Directory to your models (llama.cpp specfic settings)
-DATA_DIR = /home/USERNAME/weights
-
-# Enable Expirmental Message Caches (Limited to single session)
-# Cache will use ~1.4 GB or MORE of RAM. ONLY ENABLE IF YOUR SYSTEM CAN HANDLE THIS.
-CACHE = 1
-
-CACHE_TYPE = "disk"
-
-# Set number of threads to use, currently, a standard thread will utilize 1 whole core
-# I usually will set this between all cores I physcally have OR 2 cores less to allow for other processes.
-N_THREADS = 4
-
-# Always use MMAP unless you know what you are doing
-USE_MMAP=1
-
-# Only use MLOCK if you know what it does!
-USE_MLOCK=0
-
-# The higher the number the more hard core.
-REPEAT_PENALTY=1
-
-# GPU SPECIFIC SETTINGS BELOW
-
-GPU=1
-
-N_GPU_LAYERS=32
-
-PYTHONUNBUFFERED=1
diff --git a/docker-compose.yml b/docker-compose.yml
index 4c0815b..6499f3f 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -11,7 +11,7 @@ services:
       - ${DATA_DIR}:/usr/src/app/models
     environment:
       - HOST=llama-python-server
-      - MODEL=./models/ggml-vic7b-q4_0.bin
+      - MODEL=./models/vicuna-7b-1.1.ggmlv3.q6_K.bin
   llama-python-djs-bot:
     container_name: llama-python-djs-bot
     restart: unless-stopped
diff --git a/llamabot.js b/llamabot.js
index 533b496..a67550a 100644
--- a/llamabot.js
+++ b/llamabot.js
@@ -435,13 +435,22 @@ async function generateResponse(conversation, message) {
   // Grab the REFRESH_INTERVAL from ENV if not exist, lets use 7 (seconds)
   const refreshInterval = setInterval(showSystemLoad, (process.env.REFRESH_INTERVAL || 7) * 1000);
 
-  // Handle context size
-  // Encode the messages
-  const encodedTokens = llamaTokenizer.encode(messagesCopy);
-
-  // Check the token length
-  const tokenLength = encodedTokens.length;
-  console.log(`CTX SIZE: ${tokenLength}`);
+  function countLlamaTokens(messages) {
+    let totalTokens = 0;
+
+    for (const message of messages) {
+      if (message.role === 'user') {
+        const encodedTokens = llamaTokenizer.encode(message.content);
+        totalTokens += encodedTokens.length;
+      }
+    }
+
+    return totalTokens;
+  }
+
+  let totalTokens = countLlamaTokens(messagesCopy);
+  console.log(`Total Llama tokens: ${totalTokens}`);
+  let tokenLength = totalTokens
 
   // Remove older conversations if necessary
   const maxLength = 2048;
@@ -467,11 +476,9 @@ async function generateResponse(conversation, message) {
       }
     }
 
-      // Check the updated token length
+    // Check the updated token length
   }
 
-  console.log(`CTX SIZE AFTER PROCESSING: ${llamaTokenizer.encode(messagesCopy).length}`);
-
   // Sending request to our API
   const response = await fetch(`http://${process.env.ROOT_IP}:${process.env.ROOT_PORT}/v1/chat/completions`, {
     method: 'POST',
@@ -496,9 +503,12 @@ async function generateResponse(conversation, message) {
     // clear the interval, replace the "please wait" message with the response, and update the message
     console.log(responseText);
 
-    await botMessage.delete()
-    clearInterval(refreshInterval);
-    botMessage = null;
+    if (time > 2) {
+      await botMessage.delete()
+      clearInterval(refreshInterval);
+      botMessage = null;
+    }
+
 
     return responseText;
   } catch (err) {
diff --git a/package-lock.json b/package-lock.json
index 38e3d39..a5d9cf5 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -13,10 +13,12 @@
         "cpu-stat": "^2.0.1",
         "discord.js": "^14.9.0",
         "dotenv": "^16.0.3",
+        "gpt-tokenizer": "^2.1.1",
         "llama-tokenizer-js": "^1.0.0",
         "node-fetch": "^3.3.1",
         "node-nvidia-smi": "^1.0.0",
-        "os": "^0.1.2"
+        "os": "^0.1.2",
+        "tiktoken": "^1.0.10"
       }
     },
     "node_modules/@discordjs/builders": {
@@ -405,6 +407,14 @@
         "node": ">=12.20.0"
       }
     },
+    "node_modules/gpt-tokenizer": {
+      "version": "2.1.1",
+      "resolved": "https://registry.npmjs.org/gpt-tokenizer/-/gpt-tokenizer-2.1.1.tgz",
+      "integrity": "sha512-WlX+vj6aPaZ71U6Bf18fem+5k58zlgh2a4nbc7KHy6aGVIyq3nCh709b/8momu34sV/5t/SpzWi8LayWD9uyDw==",
+      "dependencies": {
+        "rfc4648": "^1.5.2"
+      }
+    },
     "node_modules/htmlparser2": {
       "version": "8.0.2",
       "resolved": "https://registry.npmjs.org/htmlparser2/-/htmlparser2-8.0.2.tgz",
@@ -587,6 +597,11 @@
         "url": "https://github.com/sponsors/Borewit"
       }
     },
+    "node_modules/rfc4648": {
+      "version": "1.5.2",
+      "resolved": "https://registry.npmjs.org/rfc4648/-/rfc4648-1.5.2.tgz",
+      "integrity": "sha512-tLOizhR6YGovrEBLatX1sdcuhoSCXddw3mqNVAcKxGJ+J0hFeJ+SjeWCv5UPA/WU3YzWPPuCVYgXBKZUPGpKtg=="
+    },
     "node_modules/safe-buffer": {
       "version": "5.2.1",
       "resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.2.1.tgz",
@@ -643,6 +658,11 @@
         "url": "https://github.com/sponsors/Borewit"
       }
     },
+    "node_modules/tiktoken": {
+      "version": "1.0.10",
+      "resolved": "https://registry.npmjs.org/tiktoken/-/tiktoken-1.0.10.tgz",
+      "integrity": "sha512-gF8ndTCNu7WcRFbl1UUWaFIB4CTXmHzS3tRYdyUYF7x3C6YR6Evoao4zhKDmWIwv2PzNbzoQMV8Pxt+17lEDbA=="
+    },
     "node_modules/token-types": {
       "version": "5.0.1",
       "resolved": "https://registry.npmjs.org/token-types/-/token-types-5.0.1.tgz",
diff --git a/package.json b/package.json
index 5701e15..d6cd487 100644
--- a/package.json
+++ b/package.json
@@ -14,9 +14,11 @@
     "cpu-stat": "^2.0.1",
     "discord.js": "^14.9.0",
     "dotenv": "^16.0.3",
+    "gpt-tokenizer": "^2.1.1",
    "llama-tokenizer-js": "^1.0.0",
     "node-fetch": "^3.3.1",
     "node-nvidia-smi": "^1.0.0",
-    "os": "^0.1.2"
+    "os": "^0.1.2",
+    "tiktoken": "^1.0.10"
   }
 }
diff --git a/server/Dockerfile b/server/Dockerfile
index 2720ea2..c63dce4 100644
--- a/server/Dockerfile
+++ b/server/Dockerfile
@@ -6,6 +6,6 @@ RUN apt-get update; \
 
 WORKDIR /usr/src/app
 
-RUN pip3 install llama-cpp-python[server]
+RUN CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 pip install llama-cpp-python[server]
 
 CMD python3 -m llama_cpp.server
\ No newline at end of file