counting tokens properly
This commit is contained in:
parent
bd435ca311
commit
6efd069b5d
53
default.env
53
default.env
@ -1,53 +0,0 @@
|
|||||||
# Discord Token
|
|
||||||
THE_TOKEN = "DISCORD_TOKEN_HERE"
|
|
||||||
|
|
||||||
# The Channel IDs the bot will operate in, separated by commas
|
|
||||||
CHANNEL_IDS = 1094494101631680653,1094628334727614605
|
|
||||||
|
|
||||||
# The INIT prompt for all conversations.
|
|
||||||
INIT_PROMPT = "Assistant name: ChatAI. You code, write and provide any information without any mistakes."
|
|
||||||
|
|
||||||
# Loading Embed Refresh Timing
|
|
||||||
REFRESH_INTERVAL=10
|
|
||||||
|
|
||||||
# When a message is too large for Discord we chunk the response into separate messages.
|
|
||||||
# To ensure we do not rate limit the bot we send these at a delay interval.
|
|
||||||
# DEFAULT: 3. A good setting is between 3 and 7 seconds.
|
|
||||||
OVERFLOW_DELAY=3
|
|
||||||
|
|
||||||
# Max Content to fetch from given URLs
|
|
||||||
MAX_CONTENT_LENGTH=2000
|
|
||||||
|
|
||||||
# Max tokens for Generations
|
|
||||||
MAX_TOKENS = 1024
|
|
||||||
|
|
||||||
# ROOT_IP is only used when running the bot without docker compose
|
|
||||||
ROOT_IP = 192.168.0.15
|
|
||||||
|
|
||||||
# PORT is only used when running the bot without docker compose
|
|
||||||
ROOT_PORT = 8000
|
|
||||||
|
|
||||||
# Directory to your models (llama.cpp-specific setting)
|
|
||||||
DATA_DIR = /home/USERNAME/weights
|
|
||||||
|
|
||||||
# Enable Experimental Message Caches (Limited to single session)
|
|
||||||
# Cache will use ~1.4 GB or MORE of RAM. ONLY ENABLE IF YOUR SYSTEM CAN HANDLE THIS.
|
|
||||||
CACHE = 1
|
|
||||||
|
|
||||||
CACHE_TYPE = "disk"
|
|
||||||
|
|
||||||
# Set number of threads to use, currently, a standard thread will utilize 1 whole core
|
|
||||||
# I usually set this to the number of physical cores I have, or 2 fewer to leave room for other processes.
|
|
||||||
N_THREADS = 4
|
|
||||||
|
|
||||||
# Always use MMAP unless you know what you are doing
|
|
||||||
USE_MMAP=1
|
|
||||||
|
|
||||||
# Only use MLOCK if you know what it does!
|
|
||||||
USE_MLOCK=0
|
|
||||||
|
|
||||||
# The higher the number the more hard core.
|
|
||||||
REPEAT_PENALTY=1
|
|
||||||
|
|
||||||
# Do not change
|
|
||||||
GPU=0
|
|
@ -1,58 +0,0 @@
|
|||||||
# Discord Token
|
|
||||||
THE_TOKEN = "DISCORD_TOKEN_HERE"
|
|
||||||
|
|
||||||
# The Channel IDs the bot will operate in, separated by commas
|
|
||||||
CHANNEL_IDS = 1094494101631680653,1094628334727614605
|
|
||||||
|
|
||||||
# The INIT prompt for all conversations.
|
|
||||||
INIT_PROMPT = "Assistant name: ChatAI. You code, write and provide any information without any mistakes."
|
|
||||||
|
|
||||||
# Loading Embed Refresh Timing
|
|
||||||
REFRESH_INTERVAL=10
|
|
||||||
|
|
||||||
# When a message is too large for Discord we chunk the response into separate messages.
|
|
||||||
# To ensure we do not rate limit the bot we send these at a delay interval.
|
|
||||||
# DEFAULT: 3. A good setting is between 3 and 7 seconds.
|
|
||||||
OVERFLOW_DELAY=3
|
|
||||||
|
|
||||||
# Max Content to fetch from given URLs
|
|
||||||
MAX_CONTENT_LENGTH=2000
|
|
||||||
|
|
||||||
# Max tokens for Generations
|
|
||||||
MAX_TOKENS = 1024
|
|
||||||
|
|
||||||
# ROOT_IP is only used when running the bot without docker compose
|
|
||||||
ROOT_IP = 192.168.0.15
|
|
||||||
|
|
||||||
# PORT is only used when running the bot without docker compose
|
|
||||||
ROOT_PORT = 8000
|
|
||||||
|
|
||||||
# Directory to your models (llama.cpp-specific setting)
|
|
||||||
DATA_DIR = /home/USERNAME/weights
|
|
||||||
|
|
||||||
# Enable Experimental Message Caches (Limited to single session)
|
|
||||||
# Cache will use ~1.4 GB or MORE of RAM. ONLY ENABLE IF YOUR SYSTEM CAN HANDLE THIS.
|
|
||||||
CACHE = 1
|
|
||||||
|
|
||||||
CACHE_TYPE = "disk"
|
|
||||||
|
|
||||||
# Set number of threads to use, currently, a standard thread will utilize 1 whole core
|
|
||||||
# I usually set this to the number of physical cores I have, or 2 fewer to leave room for other processes.
|
|
||||||
N_THREADS = 4
|
|
||||||
|
|
||||||
# Always use MMAP unless you know what you are doing
|
|
||||||
USE_MMAP=1
|
|
||||||
|
|
||||||
# Only use MLOCK if you know what it does!
|
|
||||||
USE_MLOCK=0
|
|
||||||
|
|
||||||
# The higher the number the more hard core.
|
|
||||||
REPEAT_PENALTY=1
|
|
||||||
|
|
||||||
# GPU SPECIFIC SETTINGS BELOW
|
|
||||||
|
|
||||||
GPU=1
|
|
||||||
|
|
||||||
N_GPU_LAYERS=32
|
|
||||||
|
|
||||||
PYTHONUNBUFFERED=1
|
|
@ -11,7 +11,7 @@ services:
|
|||||||
- ${DATA_DIR}:/usr/src/app/models
|
- ${DATA_DIR}:/usr/src/app/models
|
||||||
environment:
|
environment:
|
||||||
- HOST=llama-python-server
|
- HOST=llama-python-server
|
||||||
- MODEL=./models/ggml-vic7b-q4_0.bin
|
- MODEL=./models/vicuna-7b-1.1.ggmlv3.q6_K.bin
|
||||||
llama-python-djs-bot:
|
llama-python-djs-bot:
|
||||||
container_name: llama-python-djs-bot
|
container_name: llama-python-djs-bot
|
||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
|
26
llamabot.js
26
llamabot.js
@ -435,13 +435,22 @@ async function generateResponse(conversation, message) {
|
|||||||
// Grab the REFRESH_INTERVAL from ENV if not exist, lets use 7 (seconds)
|
// Grab the REFRESH_INTERVAL from ENV if not exist, lets use 7 (seconds)
|
||||||
const refreshInterval = setInterval(showSystemLoad, (process.env.REFRESH_INTERVAL || 7) * 1000);
|
const refreshInterval = setInterval(showSystemLoad, (process.env.REFRESH_INTERVAL || 7) * 1000);
|
||||||
|
|
||||||
// Handle context size
|
function countLlamaTokens(messages) {
|
||||||
// Encode the messages
|
let totalTokens = 0;
|
||||||
const encodedTokens = llamaTokenizer.encode(messagesCopy);
|
|
||||||
|
|
||||||
// Check the token length
|
for (const message of messages) {
|
||||||
const tokenLength = encodedTokens.length;
|
if (message.role === 'user') {
|
||||||
console.log(`CTX SIZE: ${tokenLength}`);
|
const encodedTokens = llamaTokenizer.encode(message.content);
|
||||||
|
totalTokens += encodedTokens.length;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return totalTokens;
|
||||||
|
}
|
||||||
|
|
||||||
|
let totalTokens = countLlamaTokens(messagesCopy);
|
||||||
|
console.log(`Total Llama tokens: ${totalTokens}`);
|
||||||
|
let tokenLength = totalTokens
|
||||||
|
|
||||||
// Remove older conversations if necessary
|
// Remove older conversations if necessary
|
||||||
const maxLength = 2048;
|
const maxLength = 2048;
|
||||||
@ -470,8 +479,6 @@ async function generateResponse(conversation, message) {
|
|||||||
// Check the updated token length
|
// Check the updated token length
|
||||||
}
|
}
|
||||||
|
|
||||||
console.log(`CTX SIZE AFTER PROCESSING: ${llamaTokenizer.encode(messagesCopy).length}`);
|
|
||||||
|
|
||||||
// Sending request to our API
|
// Sending request to our API
|
||||||
const response = await fetch(`http://${process.env.ROOT_IP}:${process.env.ROOT_PORT}/v1/chat/completions`, {
|
const response = await fetch(`http://${process.env.ROOT_IP}:${process.env.ROOT_PORT}/v1/chat/completions`, {
|
||||||
method: 'POST',
|
method: 'POST',
|
||||||
@ -496,9 +503,12 @@ async function generateResponse(conversation, message) {
|
|||||||
|
|
||||||
// clear the interval, replace the "please wait" message with the response, and update the message
|
// clear the interval, replace the "please wait" message with the response, and update the message
|
||||||
console.log(responseText);
|
console.log(responseText);
|
||||||
|
if (time > 2) {
|
||||||
await botMessage.delete()
|
await botMessage.delete()
|
||||||
clearInterval(refreshInterval);
|
clearInterval(refreshInterval);
|
||||||
botMessage = null;
|
botMessage = null;
|
||||||
|
}
|
||||||
|
|
||||||
return responseText;
|
return responseText;
|
||||||
|
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
|
22
package-lock.json
generated
22
package-lock.json
generated
@ -13,10 +13,12 @@
|
|||||||
"cpu-stat": "^2.0.1",
|
"cpu-stat": "^2.0.1",
|
||||||
"discord.js": "^14.9.0",
|
"discord.js": "^14.9.0",
|
||||||
"dotenv": "^16.0.3",
|
"dotenv": "^16.0.3",
|
||||||
|
"gpt-tokenizer": "^2.1.1",
|
||||||
"llama-tokenizer-js": "^1.0.0",
|
"llama-tokenizer-js": "^1.0.0",
|
||||||
"node-fetch": "^3.3.1",
|
"node-fetch": "^3.3.1",
|
||||||
"node-nvidia-smi": "^1.0.0",
|
"node-nvidia-smi": "^1.0.0",
|
||||||
"os": "^0.1.2"
|
"os": "^0.1.2",
|
||||||
|
"tiktoken": "^1.0.10"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"node_modules/@discordjs/builders": {
|
"node_modules/@discordjs/builders": {
|
||||||
@ -405,6 +407,14 @@
|
|||||||
"node": ">=12.20.0"
|
"node": ">=12.20.0"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"node_modules/gpt-tokenizer": {
|
||||||
|
"version": "2.1.1",
|
||||||
|
"resolved": "https://registry.npmjs.org/gpt-tokenizer/-/gpt-tokenizer-2.1.1.tgz",
|
||||||
|
"integrity": "sha512-WlX+vj6aPaZ71U6Bf18fem+5k58zlgh2a4nbc7KHy6aGVIyq3nCh709b/8momu34sV/5t/SpzWi8LayWD9uyDw==",
|
||||||
|
"dependencies": {
|
||||||
|
"rfc4648": "^1.5.2"
|
||||||
|
}
|
||||||
|
},
|
||||||
"node_modules/htmlparser2": {
|
"node_modules/htmlparser2": {
|
||||||
"version": "8.0.2",
|
"version": "8.0.2",
|
||||||
"resolved": "https://registry.npmjs.org/htmlparser2/-/htmlparser2-8.0.2.tgz",
|
"resolved": "https://registry.npmjs.org/htmlparser2/-/htmlparser2-8.0.2.tgz",
|
||||||
@ -587,6 +597,11 @@
|
|||||||
"url": "https://github.com/sponsors/Borewit"
|
"url": "https://github.com/sponsors/Borewit"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"node_modules/rfc4648": {
|
||||||
|
"version": "1.5.2",
|
||||||
|
"resolved": "https://registry.npmjs.org/rfc4648/-/rfc4648-1.5.2.tgz",
|
||||||
|
"integrity": "sha512-tLOizhR6YGovrEBLatX1sdcuhoSCXddw3mqNVAcKxGJ+J0hFeJ+SjeWCv5UPA/WU3YzWPPuCVYgXBKZUPGpKtg=="
|
||||||
|
},
|
||||||
"node_modules/safe-buffer": {
|
"node_modules/safe-buffer": {
|
||||||
"version": "5.2.1",
|
"version": "5.2.1",
|
||||||
"resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.2.1.tgz",
|
"resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.2.1.tgz",
|
||||||
@ -643,6 +658,11 @@
|
|||||||
"url": "https://github.com/sponsors/Borewit"
|
"url": "https://github.com/sponsors/Borewit"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"node_modules/tiktoken": {
|
||||||
|
"version": "1.0.10",
|
||||||
|
"resolved": "https://registry.npmjs.org/tiktoken/-/tiktoken-1.0.10.tgz",
|
||||||
|
"integrity": "sha512-gF8ndTCNu7WcRFbl1UUWaFIB4CTXmHzS3tRYdyUYF7x3C6YR6Evoao4zhKDmWIwv2PzNbzoQMV8Pxt+17lEDbA=="
|
||||||
|
},
|
||||||
"node_modules/token-types": {
|
"node_modules/token-types": {
|
||||||
"version": "5.0.1",
|
"version": "5.0.1",
|
||||||
"resolved": "https://registry.npmjs.org/token-types/-/token-types-5.0.1.tgz",
|
"resolved": "https://registry.npmjs.org/token-types/-/token-types-5.0.1.tgz",
|
||||||
|
@ -14,9 +14,11 @@
|
|||||||
"cpu-stat": "^2.0.1",
|
"cpu-stat": "^2.0.1",
|
||||||
"discord.js": "^14.9.0",
|
"discord.js": "^14.9.0",
|
||||||
"dotenv": "^16.0.3",
|
"dotenv": "^16.0.3",
|
||||||
|
"gpt-tokenizer": "^2.1.1",
|
||||||
"llama-tokenizer-js": "^1.0.0",
|
"llama-tokenizer-js": "^1.0.0",
|
||||||
"node-fetch": "^3.3.1",
|
"node-fetch": "^3.3.1",
|
||||||
"node-nvidia-smi": "^1.0.0",
|
"node-nvidia-smi": "^1.0.0",
|
||||||
"os": "^0.1.2"
|
"os": "^0.1.2",
|
||||||
|
"tiktoken": "^1.0.10"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -6,6 +6,6 @@ RUN apt-get update; \
|
|||||||
|
|
||||||
WORKDIR /usr/src/app
|
WORKDIR /usr/src/app
|
||||||
|
|
||||||
RUN pip3 install llama-cpp-python[server]
|
RUN CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 pip install llama-cpp-python[server]
|
||||||
|
|
||||||
CMD python3 -m llama_cpp.server
|
CMD python3 -m llama_cpp.server
|
Loading…
Reference in New Issue
Block a user