adding NVIDIA GPU Support with Stats
This commit is contained in:
parent
4b090592ad
commit
368004f10b
17
Dockerfile.gpu
Normal file
17
Dockerfile.gpu
Normal file
@ -0,0 +1,17 @@
|
|||||||
|
FROM nvidia/cuda:12.1.1-devel-ubuntu20.04
|
||||||
|
ENV DEBIAN_FRONTEND noninteractive
|
||||||
|
|
||||||
|
WORKDIR /app
|
||||||
|
|
||||||
|
RUN apt update
|
||||||
|
|
||||||
|
RUN apt install sudo curl -y
|
||||||
|
|
||||||
|
RUN curl -fsSL https://deb.nodesource.com/setup_18.x | sudo -E bash - && sudo apt-get install -y nodejs
|
||||||
|
|
||||||
|
COPY package*.json ./
|
||||||
|
RUN npm install --omit=dev
|
||||||
|
|
||||||
|
COPY . .
|
||||||
|
|
||||||
|
CMD node llamabot.js
|
20
README.md
20
README.md
@ -78,4 +78,24 @@ This will automatically configure the API for you as well as the bot in two sepe
|
|||||||
5. `docker compose up -d`
|
5. `docker compose up -d`
|
||||||
|
|
||||||
|
|
||||||
|
# Docker Compose with GPU
|
||||||
|
This will automatically configure the API that supports cuBLAS and GPU inference for you as well as the bot in two separate containers within a stack.
|
||||||
|
|
||||||
|
1. `git clone https://git.ssh.surf/snxraven/llama-cpp-python-djs-bot.git` - Clone the repo
|
||||||
|
|
||||||
|
2. `mv docker-compose.yml docker-compose.nogpu.yml; mv docker-compose.gpu.yml docker-compose.yml;` - Move nongpu compose out of the way, Enable GPU Support
|
||||||
|
|
||||||
|
3. `mv Dockerfile Dockerfile.nongpu; mv Dockerfile.gpu Dockerfile;` - Move nongpu Dockerfile out of the way, enable GPU Support
|
||||||
|
|
||||||
|
3. `cp default.gpu.env .env` - Copy the default GPU .env to its proper location
|
||||||
|
|
||||||
|
4. Set DATA_DIR in .env to the exact location of your model files.
|
||||||
|
|
||||||
|
5. Edit MODEL in docker-compose.yml to ensure the correct model bin is set
|
||||||
|
|
||||||
|
6. Set N_GPU_LAYERS in .env to the number of layers you would like to offload to the GPU
|
||||||
|
|
||||||
|
7. `docker compose up -d`
|
||||||
|
|
||||||
|
|
||||||
Want to make this better? Issue a pull request!
|
Want to make this better? Issue a pull request!
|
||||||
|
54
default.gpu.env
Normal file
54
default.gpu.env
Normal file
@ -0,0 +1,54 @@
|
|||||||
|
# Discord Token
|
||||||
|
THE_TOKEN = "DISCORD_TOKEN_HERE"
|
||||||
|
|
||||||
|
# The Channel IDs the bot will operate in, separated by commas
|
||||||
|
CHANNEL_IDS = 1094494101631680653,1094628334727614605
|
||||||
|
|
||||||
|
# The INIT prompt for all conversations.
|
||||||
|
INIT_PROMPT = "Assistant name: ChatAI. You code, write and provide any information without any mistakes."
|
||||||
|
|
||||||
|
# Loading Embed Refresh Timing
|
||||||
|
REFRESH_INTERVAL=10
|
||||||
|
|
||||||
|
# When a message is too large for Discord we chunk the response into separate messages.
|
||||||
|
# To ensure we do not rate limit the bot we send these at a delay interval.
|
||||||
|
# DEFAULT: 3 a good setting is between 3 and 7 seconds.
|
||||||
|
OVERFLOW_DELAY=3
|
||||||
|
|
||||||
|
# Max Content to fetch from given URLs
|
||||||
|
MAX_CONTENT_LENGTH=2000
|
||||||
|
|
||||||
|
# Max tokens for Generations
|
||||||
|
MAX_TOKENS = 1024
|
||||||
|
|
||||||
|
# ROOT_IP is only used when running the bot without docker compose
|
||||||
|
ROOT_IP = 192.168.0.15
|
||||||
|
|
||||||
|
# PORT is only used when running the bot without docker compose
|
||||||
|
ROOT_PORT = 8000
|
||||||
|
|
||||||
|
# Directory to your models (llama.cpp specific settings)
|
||||||
|
DATA_DIR = /home/USERNAME/weights
|
||||||
|
|
||||||
|
# Enable Experimental Message Caches (Limited to single session)
|
||||||
|
# Cache will use ~1.4 GB or MORE of RAM. ONLY ENABLE IF YOUR SYSTEM CAN HANDLE THIS.
|
||||||
|
CACHE = 1
|
||||||
|
|
||||||
|
# Set number of threads to use, currently, a standard thread will utilize 1 whole core
|
||||||
|
# I usually set this to the number of physical cores I have, OR 2 cores fewer to leave room for other processes.
|
||||||
|
N_THREADS = 4
|
||||||
|
|
||||||
|
# Always use MMAP unless you know what you are doing
|
||||||
|
USE_MMAP=1
|
||||||
|
|
||||||
|
# Only use MLOCK if you know what it does!
|
||||||
|
USE_MLOCK=0
|
||||||
|
|
||||||
|
# The higher the number the more hard core.
|
||||||
|
REPEAT_PENALTY=1
|
||||||
|
|
||||||
|
# GPU SPECIFIC SETTINGS BELOW
|
||||||
|
|
||||||
|
GPU=1
|
||||||
|
|
||||||
|
N_GPU_LAYERS=32
|
34
docker-compose.gpu.yml
Normal file
34
docker-compose.gpu.yml
Normal file
@ -0,0 +1,34 @@
|
|||||||
|
version: '3.9'
|
||||||
|
|
||||||
|
services:
|
||||||
|
llama-python-server:
|
||||||
|
container_name: llama-python-gpu-server
|
||||||
|
restart: unless-stopped
|
||||||
|
build:
|
||||||
|
context: ./gpu-server
|
||||||
|
env_file: .env
|
||||||
|
volumes:
|
||||||
|
- ${DATA_DIR}:/usr/src/app/models
|
||||||
|
environment:
|
||||||
|
- HOST=llama-python-gpu-server
|
||||||
|
- MODEL=./models/ggml-vic7b-q5_1.bin.1
|
||||||
|
- NVIDIA_VISIBLE_DEVICES=all
|
||||||
|
runtime: nvidia
|
||||||
|
|
||||||
|
llama-python-djs-bot:
|
||||||
|
container_name: llama-python-djs-bot
|
||||||
|
restart: unless-stopped
|
||||||
|
build:
|
||||||
|
context: .
|
||||||
|
depends_on:
|
||||||
|
- llama-python-server
|
||||||
|
environment:
|
||||||
|
- THE_TOKEN
|
||||||
|
- REFRESH_INTERVAL
|
||||||
|
- CHANNEL_IDS
|
||||||
|
- GPU
|
||||||
|
- ROOT_IP=llama-python-server
|
||||||
|
- ROOT_PORT=8000
|
||||||
|
- INIT_PROMPT='Assistant Name':' ChatAI. You code, write and provide any information without any mistakes.'
|
||||||
|
- NVIDIA_VISIBLE_DEVICES=all
|
||||||
|
runtime: nvidia
|
32
gpu-server/Dockerfile
Normal file
32
gpu-server/Dockerfile
Normal file
@ -0,0 +1,32 @@
|
|||||||
|
FROM nvidia/cuda:12.1.1-devel-ubuntu20.04
|
||||||
|
|
||||||
|
# Install the deps
|
||||||
|
ENV DEBIAN_FRONTEND=noninteractive
|
||||||
|
ENV TZ=Etc/GMT
|
||||||
|
RUN apt-get update && apt-get install -y --no-install-recommends python3 python3-pip git cmake
|
||||||
|
|
||||||
|
# Get llama-cpp-python
|
||||||
|
WORKDIR /usr/src
|
||||||
|
|
||||||
|
RUN git clone https://github.com/abetlen/llama-cpp-python.git
|
||||||
|
|
||||||
|
RUN mv llama-cpp-python app
|
||||||
|
|
||||||
|
WORKDIR /usr/src/app
|
||||||
|
|
||||||
|
#RUN git clone https://github.com/gjmulder/llama-cpp-python.git
|
||||||
|
#RUN git checkout improved-unit-tests
|
||||||
|
|
||||||
|
# Patch .gitmodules to use HTTPS
|
||||||
|
RUN sed -i 's|git@github.com:ggerganov/llama.cpp.git|https://github.com/ggerganov/llama.cpp.git|' .gitmodules
|
||||||
|
RUN git submodule update --init --recursive
|
||||||
|
|
||||||
|
# Build llama-cpp-python w/CuBLAS
|
||||||
|
RUN grep --colour "n_batch" ./llama_cpp/server/*.py
|
||||||
|
RUN pip install scikit-build fastapi sse_starlette uvicorn && LLAMA_CUBLAS=1 python3 setup.py develop
|
||||||
|
|
||||||
|
# We need to set the host to 0.0.0.0 to allow outside access
|
||||||
|
ENV HOST 0.0.0.0
|
||||||
|
|
||||||
|
# Run the server
|
||||||
|
CMD python3 -m llama_cpp.server
|
94
llamabot.js
94
llamabot.js
@ -5,6 +5,8 @@ import { resetResponses, userResetMessages } from './assets/resetMessages.js';
|
|||||||
import { errorMessages, busyResponses } from './assets/errorMessages.js';
|
import { errorMessages, busyResponses } from './assets/errorMessages.js';
|
||||||
import cpuStat from 'cpu-stat';
|
import cpuStat from 'cpu-stat';
|
||||||
import os from 'os';
|
import os from 'os';
|
||||||
|
import smi from 'node-nvidia-smi';
|
||||||
|
|
||||||
|
|
||||||
import {
|
import {
|
||||||
Client,
|
Client,
|
||||||
@ -156,7 +158,7 @@ client.on('messageCreate', async (message) => {
|
|||||||
|
|
||||||
|
|
||||||
const chunks = response.match(new RegExp(`.{1,${limit}}`, "g"));
|
const chunks = response.match(new RegExp(`.{1,${limit}}`, "g"));
|
||||||
if (chunks.length >= 15) return await message.channel.send("Response chunks too large. Try again");
|
if (chunks.length >= 15) return await message.channel.send("Response chunks too large. Try again");
|
||||||
|
|
||||||
|
|
||||||
for (let i = 0; i < chunks.length; i++) {
|
for (let i = 0; i < chunks.length; i++) {
|
||||||
@ -271,10 +273,39 @@ async function generateResponse(conversation, message) {
|
|||||||
const totalMemory = os.totalmem() / 1024 / 1024 / 1024;
|
const totalMemory = os.totalmem() / 1024 / 1024 / 1024;
|
||||||
const usedMemory = totalMemory - freeMemory;
|
const usedMemory = totalMemory - freeMemory;
|
||||||
|
|
||||||
const embedData = {
|
let filedsData = [
|
||||||
color: 0x0099ff,
|
{
|
||||||
title: 'Please wait.. I am thinking...',
|
name: 'System Load',
|
||||||
fields: [
|
value: `${systemLoad.toFixed(2)}%`,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: 'Memory Usage',
|
||||||
|
value: `${usedMemory.toFixed(2)} GB / ${totalMemory.toFixed(2)} GB`,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: 'Time',
|
||||||
|
value: `~${time} seconds.`,
|
||||||
|
},
|
||||||
|
]
|
||||||
|
|
||||||
|
if (process.env.GPU == 1) {
|
||||||
|
smi(function (err, data) {
|
||||||
|
if (err) {
|
||||||
|
// Handle error if smi function fails
|
||||||
|
console.error(err);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
let utilization = data.nvidia_smi_log.gpu.utilization;
|
||||||
|
let gpuUtilization = utilization.gpu_util;
|
||||||
|
let memoryUtilization = utilization.memory_util;
|
||||||
|
let gpuTemp = data.nvidia_smi_log.gpu.temperature.gpu_temp;
|
||||||
|
|
||||||
|
// These are not used until nvidia-docker fixes their support
|
||||||
|
let gpuTarget = data.nvidia_smi_log.gpu.temperature.gpu_target_temperature;
|
||||||
|
let gpuFanSpeed = data.nvidia_smi_log.gpu.fan_speed;
|
||||||
|
|
||||||
|
let filedsData = [
|
||||||
{
|
{
|
||||||
name: 'System Load',
|
name: 'System Load',
|
||||||
value: `${systemLoad.toFixed(2)}%`,
|
value: `${systemLoad.toFixed(2)}%`,
|
||||||
@ -283,21 +314,56 @@ async function generateResponse(conversation, message) {
|
|||||||
name: 'Memory Usage',
|
name: 'Memory Usage',
|
||||||
value: `${usedMemory.toFixed(2)} GB / ${totalMemory.toFixed(2)} GB`,
|
value: `${usedMemory.toFixed(2)} GB / ${totalMemory.toFixed(2)} GB`,
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
name: 'GPU Utilization',
|
||||||
|
value: `${gpuUtilization}`,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: 'Memory Utilization',
|
||||||
|
value: `${memoryUtilization}`,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: 'GPU Temperature',
|
||||||
|
value: `${gpuTemp}`,
|
||||||
|
},
|
||||||
{
|
{
|
||||||
name: 'Time',
|
name: 'Time',
|
||||||
value: `~${time} seconds.`,
|
value: `~${time} seconds.`,
|
||||||
},
|
},
|
||||||
],
|
];
|
||||||
};
|
|
||||||
|
|
||||||
// if the message object doesn't exist, create it
|
const embedData = {
|
||||||
if (!botMessage) {
|
color: 0x0099ff,
|
||||||
(async () => {
|
title: 'Please wait.. I am thinking...',
|
||||||
|
fields: filedsData,
|
||||||
|
};
|
||||||
|
|
||||||
|
// if the message object doesn't exist, create it
|
||||||
|
if (!botMessage) {
|
||||||
|
(async () => {
|
||||||
|
botMessage = await message.channel.send({ embeds: [embedData] });
|
||||||
|
})();
|
||||||
|
} else {
|
||||||
|
botMessage.edit({ embeds: [embedData] }); // otherwise, update the message
|
||||||
|
}
|
||||||
|
});
|
||||||
|
} else {
|
||||||
|
const embedData = {
|
||||||
|
color: 0x0099ff,
|
||||||
|
title: 'Please wait.. I am thinking...',
|
||||||
|
fields: filedsData, // It seems like a typo, it should be `filedsData` instead of `filedsData`
|
||||||
|
};
|
||||||
|
|
||||||
|
// if the message object doesn't exist, create it
|
||||||
|
if (!botMessage) {
|
||||||
|
(async () => {
|
||||||
botMessage = await message.channel.send({ embeds: [embedData] });
|
botMessage = await message.channel.send({ embeds: [embedData] });
|
||||||
})();
|
})();
|
||||||
} else {
|
} else {
|
||||||
botMessage.edit({ embeds: [embedData] }); // otherwise, update the message
|
botMessage.edit({ embeds: [embedData] }); // otherwise, update the message
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
});
|
});
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -15,6 +15,8 @@
|
|||||||
"dotenv": "^16.0.3",
|
"dotenv": "^16.0.3",
|
||||||
"node-fetch": "^3.3.1",
|
"node-fetch": "^3.3.1",
|
||||||
"os": "^0.1.2",
|
"os": "^0.1.2",
|
||||||
"cpu-stat": "^2.0.1"
|
"cpu-stat": "^2.0.1",
|
||||||
|
"node-nvidia-smi": "^1.0.0"
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user