counting tokens properly

This commit is contained in:
Raven Scott 2023-08-14 23:00:23 -04:00
parent bd435ca311
commit 6efd069b5d
7 changed files with 49 additions and 128 deletions

View File

@ -1,53 +0,0 @@
# Discord Token
THE_TOKEN = "DISCORD_TOKEN_HERE"
# The channel IDs the bot will operate in, separated by commas
CHANNEL_IDS = 1094494101631680653,1094628334727614605
# The INIT prompt for all conversations.
INIT_PROMPT = "Assistant name: ChatAI. You code, write and provide any information without any mistakes."
# Loading Embed Refresh Timing (in seconds)
REFRESH_INTERVAL=10
# When a message is too large for Discord, we chunk the response into separate messages.
# To ensure we do not rate limit the bot, we send these chunks at a delay interval.
# DEFAULT: 3. A good setting is between 3 and 7 seconds.
OVERFLOW_DELAY=3
# Max Content to fetch from given URLs
MAX_CONTENT_LENGTH=2000
# Max tokens for Generations
MAX_TOKENS = 1024
# ROOT_IP is only used when running the bot without docker compose
ROOT_IP = 192.168.0.15
# ROOT_PORT is only used when running the bot without docker compose
ROOT_PORT = 8000
# Directory to your models (llama.cpp specific settings)
DATA_DIR = /home/USERNAME/weights
# Enable Experimental Message Caches (Limited to single session)
# Cache will use ~1.4 GB or MORE of RAM. ONLY ENABLE IF YOUR SYSTEM CAN HANDLE THIS.
CACHE = 1
CACHE_TYPE = "disk"
# Set the number of threads to use; currently, a standard thread will utilize 1 whole core.
# I usually set this to either all the cores I physically have OR 2 cores fewer, to leave room for other processes.
N_THREADS = 4
# Always use MMAP unless you know what you are doing
USE_MMAP=1
# Only use MLOCK if you know what it does!
USE_MLOCK=0
# Repeat penalty: the higher the number, the more strongly repeated text is penalized.
REPEAT_PENALTY=1
# Do not change
GPU=0

View File

@ -1,58 +0,0 @@
# Discord Token
THE_TOKEN = "DISCORD_TOKEN_HERE"
# The channel IDs the bot will operate in, separated by commas
CHANNEL_IDS = 1094494101631680653,1094628334727614605
# The INIT prompt for all conversations.
INIT_PROMPT = "Assistant name: ChatAI. You code, write and provide any information without any mistakes."
# Loading Embed Refresh Timing (in seconds)
REFRESH_INTERVAL=10
# When a message is too large for Discord, we chunk the response into separate messages.
# To ensure we do not rate limit the bot, we send these chunks at a delay interval.
# DEFAULT: 3. A good setting is between 3 and 7 seconds.
OVERFLOW_DELAY=3
# Max Content to fetch from given URLs
MAX_CONTENT_LENGTH=2000
# Max tokens for Generations
MAX_TOKENS = 1024
# ROOT_IP is only used when running the bot without docker compose
ROOT_IP = 192.168.0.15
# ROOT_PORT is only used when running the bot without docker compose
ROOT_PORT = 8000
# Directory to your models (llama.cpp specific settings)
DATA_DIR = /home/USERNAME/weights
# Enable Experimental Message Caches (Limited to single session)
# Cache will use ~1.4 GB or MORE of RAM. ONLY ENABLE IF YOUR SYSTEM CAN HANDLE THIS.
CACHE = 1
CACHE_TYPE = "disk"
# Set the number of threads to use; currently, a standard thread will utilize 1 whole core.
# I usually set this to either all the cores I physically have OR 2 cores fewer, to leave room for other processes.
N_THREADS = 4
# Always use MMAP unless you know what you are doing
USE_MMAP=1
# Only use MLOCK if you know what it does!
USE_MLOCK=0
# Repeat penalty: the higher the number, the more strongly repeated text is penalized.
REPEAT_PENALTY=1
# GPU SPECIFIC SETTINGS BELOW
GPU=1
N_GPU_LAYERS=32
PYTHONUNBUFFERED=1

View File

@ -11,7 +11,7 @@ services:
- ${DATA_DIR}:/usr/src/app/models - ${DATA_DIR}:/usr/src/app/models
environment: environment:
- HOST=llama-python-server - HOST=llama-python-server
- MODEL=./models/ggml-vic7b-q4_0.bin - MODEL=./models/vicuna-7b-1.1.ggmlv3.q6_K.bin
llama-python-djs-bot: llama-python-djs-bot:
container_name: llama-python-djs-bot container_name: llama-python-djs-bot
restart: unless-stopped restart: unless-stopped

View File

@ -435,13 +435,22 @@ async function generateResponse(conversation, message) {
// Grab the REFRESH_INTERVAL from ENV if not exist, lets use 7 (seconds) // Grab the REFRESH_INTERVAL from ENV if not exist, lets use 7 (seconds)
const refreshInterval = setInterval(showSystemLoad, (process.env.REFRESH_INTERVAL || 7) * 1000); const refreshInterval = setInterval(showSystemLoad, (process.env.REFRESH_INTERVAL || 7) * 1000);
// Handle context size function countLlamaTokens(messages) {
// Encode the messages let totalTokens = 0;
const encodedTokens = llamaTokenizer.encode(messagesCopy);
// Check the token length for (const message of messages) {
const tokenLength = encodedTokens.length; if (message.role === 'user') {
console.log(`CTX SIZE: ${tokenLength}`); const encodedTokens = llamaTokenizer.encode(message.content);
totalTokens += encodedTokens.length;
}
}
return totalTokens;
}
let totalTokens = countLlamaTokens(messagesCopy);
console.log(`Total Llama tokens: ${totalTokens}`);
let tokenLength = totalTokens
// Remove older conversations if necessary // Remove older conversations if necessary
const maxLength = 2048; const maxLength = 2048;
@ -467,11 +476,9 @@ async function generateResponse(conversation, message) {
} }
} }
// Check the updated token length // Check the updated token length
} }
console.log(`CTX SIZE AFTER PROCESSING: ${llamaTokenizer.encode(messagesCopy).length}`);
// Sending request to our API // Sending request to our API
const response = await fetch(`http://${process.env.ROOT_IP}:${process.env.ROOT_PORT}/v1/chat/completions`, { const response = await fetch(`http://${process.env.ROOT_IP}:${process.env.ROOT_PORT}/v1/chat/completions`, {
method: 'POST', method: 'POST',
@ -496,9 +503,12 @@ async function generateResponse(conversation, message) {
// clear the interval, replace the "please wait" message with the response, and update the message // clear the interval, replace the "please wait" message with the response, and update the message
console.log(responseText); console.log(responseText);
await botMessage.delete() if (time > 2) {
clearInterval(refreshInterval); await botMessage.delete()
botMessage = null; clearInterval(refreshInterval);
botMessage = null;
}
return responseText; return responseText;
} catch (err) { } catch (err) {

22
package-lock.json generated
View File

@ -13,10 +13,12 @@
"cpu-stat": "^2.0.1", "cpu-stat": "^2.0.1",
"discord.js": "^14.9.0", "discord.js": "^14.9.0",
"dotenv": "^16.0.3", "dotenv": "^16.0.3",
"gpt-tokenizer": "^2.1.1",
"llama-tokenizer-js": "^1.0.0", "llama-tokenizer-js": "^1.0.0",
"node-fetch": "^3.3.1", "node-fetch": "^3.3.1",
"node-nvidia-smi": "^1.0.0", "node-nvidia-smi": "^1.0.0",
"os": "^0.1.2" "os": "^0.1.2",
"tiktoken": "^1.0.10"
} }
}, },
"node_modules/@discordjs/builders": { "node_modules/@discordjs/builders": {
@ -405,6 +407,14 @@
"node": ">=12.20.0" "node": ">=12.20.0"
} }
}, },
"node_modules/gpt-tokenizer": {
"version": "2.1.1",
"resolved": "https://registry.npmjs.org/gpt-tokenizer/-/gpt-tokenizer-2.1.1.tgz",
"integrity": "sha512-WlX+vj6aPaZ71U6Bf18fem+5k58zlgh2a4nbc7KHy6aGVIyq3nCh709b/8momu34sV/5t/SpzWi8LayWD9uyDw==",
"dependencies": {
"rfc4648": "^1.5.2"
}
},
"node_modules/htmlparser2": { "node_modules/htmlparser2": {
"version": "8.0.2", "version": "8.0.2",
"resolved": "https://registry.npmjs.org/htmlparser2/-/htmlparser2-8.0.2.tgz", "resolved": "https://registry.npmjs.org/htmlparser2/-/htmlparser2-8.0.2.tgz",
@ -587,6 +597,11 @@
"url": "https://github.com/sponsors/Borewit" "url": "https://github.com/sponsors/Borewit"
} }
}, },
"node_modules/rfc4648": {
"version": "1.5.2",
"resolved": "https://registry.npmjs.org/rfc4648/-/rfc4648-1.5.2.tgz",
"integrity": "sha512-tLOizhR6YGovrEBLatX1sdcuhoSCXddw3mqNVAcKxGJ+J0hFeJ+SjeWCv5UPA/WU3YzWPPuCVYgXBKZUPGpKtg=="
},
"node_modules/safe-buffer": { "node_modules/safe-buffer": {
"version": "5.2.1", "version": "5.2.1",
"resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.2.1.tgz", "resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.2.1.tgz",
@ -643,6 +658,11 @@
"url": "https://github.com/sponsors/Borewit" "url": "https://github.com/sponsors/Borewit"
} }
}, },
"node_modules/tiktoken": {
"version": "1.0.10",
"resolved": "https://registry.npmjs.org/tiktoken/-/tiktoken-1.0.10.tgz",
"integrity": "sha512-gF8ndTCNu7WcRFbl1UUWaFIB4CTXmHzS3tRYdyUYF7x3C6YR6Evoao4zhKDmWIwv2PzNbzoQMV8Pxt+17lEDbA=="
},
"node_modules/token-types": { "node_modules/token-types": {
"version": "5.0.1", "version": "5.0.1",
"resolved": "https://registry.npmjs.org/token-types/-/token-types-5.0.1.tgz", "resolved": "https://registry.npmjs.org/token-types/-/token-types-5.0.1.tgz",

View File

@ -14,9 +14,11 @@
"cpu-stat": "^2.0.1", "cpu-stat": "^2.0.1",
"discord.js": "^14.9.0", "discord.js": "^14.9.0",
"dotenv": "^16.0.3", "dotenv": "^16.0.3",
"gpt-tokenizer": "^2.1.1",
"llama-tokenizer-js": "^1.0.0", "llama-tokenizer-js": "^1.0.0",
"node-fetch": "^3.3.1", "node-fetch": "^3.3.1",
"node-nvidia-smi": "^1.0.0", "node-nvidia-smi": "^1.0.0",
"os": "^0.1.2" "os": "^0.1.2",
"tiktoken": "^1.0.10"
} }
} }

View File

@ -6,6 +6,6 @@ RUN apt-get update; \
WORKDIR /usr/src/app WORKDIR /usr/src/app
RUN pip3 install llama-cpp-python[server] RUN CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 pip install llama-cpp-python[server]
CMD python3 -m llama_cpp.server CMD python3 -m llama_cpp.server