Adding NVIDIA GPU Support with Stats

Raven Scott 2023-05-19 21:32:21 +02:00
parent 4b090592ad
commit 368004f10b
7 changed files with 244 additions and 19 deletions

Dockerfile.gpu (new file)

@@ -0,0 +1,17 @@
FROM nvidia/cuda:12.1.1-devel-ubuntu20.04
ENV DEBIAN_FRONTEND noninteractive
WORKDIR /app
RUN apt update
RUN apt install sudo curl -y
RUN curl -fsSL https://deb.nodesource.com/setup_18.x | sudo -E bash - && sudo apt-get install -y nodejs
COPY package*.json ./
RUN npm install --omit=dev
COPY . .
CMD node llamabot.js

README.md

@@ -78,4 +78,24 @@ This will automatically configure the API for you as well as the bot in two sepe
5. `docker compose up -d`

# Docker Compose with GPU

This will automatically configure the API (with cuBLAS and GPU inference support) for you, as well as the bot, in two separate containers within a stack.

1. `git clone https://git.ssh.surf/snxraven/llama-cpp-python-djs-bot.git` - Clone the repo
2. `mv docker-compose.yml docker-compose.nogpu.yml; mv docker-compose.gpu.yml docker-compose.yml;` - Move the non-GPU compose file out of the way and enable GPU support
3. `mv Dockerfile Dockerfile.nongpu; mv Dockerfile.gpu Dockerfile;` - Move the non-GPU Dockerfile out of the way and enable GPU support
4. `cp default.gpu.env .env` - Copy the default GPU .env to its proper location
5. Set DATA_DIR in .env to the exact location of your model files.
6. Edit MODEL in docker-compose.yml to ensure the correct model bin is set
7. Set N_GPU_LAYERS to the number of layers you would like to offload to the GPU
8. `docker compose up -d`

Want to make this better? Issue a pull request!

default.gpu.env (new file)

@@ -0,0 +1,54 @@
# Discord Token
THE_TOKEN = "DISCORD_TOKEN_HERE"
# The Channel IDs the bot will operate in, separated by commas
CHANNEL_IDS = 1094494101631680653,1094628334727614605
# The INIT prompt for all conversations.
INIT_PROMPT = "Assistant name: ChatAI. You code, write and provide any information without any mistakes."
# Loading Embed Refresh Timing
REFRESH_INTERVAL=10
# When a message is too large for discord we chunk the response into separate messages.
# To ensure we do not rate limit the bot we send these at a delay interval.
# DEFAULT: 3. A good setting is between 3 and 7 seconds.
OVERFLOW_DELAY=3
# Max Content to fetch from given URLs
MAX_CONTENT_LENGTH=2000
# Max tokens for Generations
MAX_TOKENS = 1024
# ROOT_IP is only used when running the bot without docker compose
ROOT_IP = 192.168.0.15
# ROOT_PORT is only used when running the bot without docker compose
ROOT_PORT = 8000
# Directory to your models (llama.cpp specific settings)
DATA_DIR = /home/USERNAME/weights
# Enable Experimental Message Caches (Limited to single session)
# Cache will use ~1.4 GB or MORE of RAM. ONLY ENABLE IF YOUR SYSTEM CAN HANDLE THIS.
CACHE = 1
# Set the number of threads to use; currently, a standard thread will utilize 1 whole core.
# I usually set this to the number of cores I physically have, OR 2 cores fewer, to leave room for other processes.
N_THREADS = 4
# Always use MMAP unless you know what you are doing
USE_MMAP=1
# Only use MLOCK if you know what it does!
USE_MLOCK=0
# The higher the value, the more strongly repeated tokens are penalized.
REPEAT_PENALTY=1
# GPU SPECIFIC SETTINGS BELOW
GPU=1
N_GPU_LAYERS=32
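
These settings reach the bot as environment variables. A minimal sketch (not from the commit, illustrative names only) of how they might be consumed, assuming the file is loaded with dotenv, which is a declared dependency; every `process.env` value arrives as a string, which is why llamabot.js compares with loose equality (`process.env.GPU == 1`):

```js
// Sketch only: assumes dotenv loads .env into process.env.
import 'dotenv/config';

const gpuEnabled = process.env.GPU == 1;                      // "1" == 1 -> true (loose equality)
const refreshInterval = Number(process.env.REFRESH_INTERVAL); // e.g. 10 (seconds)
const maxTokens = Number(process.env.MAX_TOKENS);             // e.g. 1024

console.log({ gpuEnabled, refreshInterval, maxTokens });
```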

docker-compose.gpu.yml (new file)

@@ -0,0 +1,34 @@
version: '3.9'

services:
  llama-python-server:
    container_name: llama-python-gpu-server
    restart: unless-stopped
    build:
      context: ./gpu-server
    env_file: .env
    volumes:
      - ${DATA_DIR}:/usr/src/app/models
    environment:
      - HOST=llama-python-gpu-server
      - MODEL=./models/ggml-vic7b-q5_1.bin.1
      - NVIDIA_VISIBLE_DEVICES=all
    runtime: nvidia

  llama-python-djs-bot:
    container_name: llama-python-djs-bot
    restart: unless-stopped
    build:
      context: .
    depends_on:
      - llama-python-server
    environment:
      - THE_TOKEN
      - REFRESH_INTERVAL
      - CHANNEL_IDS
      - GPU
      - ROOT_IP=llama-python-server
      - ROOT_PORT=8000
      - INIT_PROMPT='Assistant Name':' ChatAI. You code, write and provide any information without any mistakes.'
      - NVIDIA_VISIBLE_DEVICES=all
    runtime: nvidia
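
The compose file wires the two containers together by service name: the bot receives `ROOT_IP=llama-python-server` and `ROOT_PORT=8000`, so it reaches the GPU server over the compose network. A hedged sketch of how such a request could be built with node-fetch; the exact endpoint and payload used by llamabot.js are not shown in this diff, and the `/v1/chat/completions` route is assumed from llama-cpp-python's OpenAI-compatible server:

```js
// Illustrative only; the real request logic lives in llamabot.js.
import fetch from 'node-fetch';

const BASE_URL = `http://${process.env.ROOT_IP}:${process.env.ROOT_PORT}`;

async function askServer(messages) {
  // messages: [{ role: 'user', content: '...' }, ...]
  const res = await fetch(`${BASE_URL}/v1/chat/completions`, { // assumed route
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ messages, max_tokens: Number(process.env.MAX_TOKENS) }),
  });
  if (!res.ok) throw new Error(`llama server returned ${res.status}`);
  return res.json();
}
```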

gpu-server/Dockerfile (new file)

@@ -0,0 +1,32 @@
FROM nvidia/cuda:12.1.1-devel-ubuntu20.04
# Install the deps
ENV DEBIAN_FRONTEND=noninteractive
ENV TZ=Etc/GMT
RUN apt-get update && apt-get install -y --no-install-recommends python3 python3-pip git cmake
# Get llama-cpp-python
WORKDIR /usr/src
RUN git clone https://github.com/abetlen/llama-cpp-python.git
RUN mv llama-cpp-python app
WORKDIR /usr/src/app
#RUN git clone https://github.com/gjmulder/llama-cpp-python.git
#RUN git checkout improved-unit-tests
# Patch .gitmodules to use HTTPS
RUN sed -i 's|git@github.com:ggerganov/llama.cpp.git|https://github.com/ggerganov/llama.cpp.git|' .gitmodules
RUN git submodule update --init --recursive
# Sanity-check that the expected server sources are present (grep fails the build if not)
RUN grep --colour "n_batch" ./llama_cpp/server/*.py
# Build llama-cpp-python w/ cuBLAS
RUN pip install scikit-build fastapi sse_starlette uvicorn && LLAMA_CUBLAS=1 python3 setup.py develop
# We need to set the host to 0.0.0.0 to allow outside access
ENV HOST 0.0.0.0
# Run the server
CMD python3 -m llama_cpp.server
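
Because the CUDA build of llama.cpp can take a while to load a model, the bot container may come up before the server is ready. A hypothetical readiness probe, not part of this commit: the `/v1/models` route is assumed from llama-cpp-python's OpenAI-compatible server, and the compose file declares no healthcheck, so this is only a sketch of one way to wait:

```js
// Hypothetical helper: poll the server until it answers or retries run out.
import fetch from 'node-fetch';

async function waitForServer(baseUrl, retries = 30, delayMs = 2000) {
  for (let i = 0; i < retries; i++) {
    try {
      const res = await fetch(`${baseUrl}/v1/models`); // assumed route
      if (res.ok) return true;
    } catch {
      // server not reachable yet; fall through to the retry delay
    }
    await new Promise((resolve) => setTimeout(resolve, delayMs));
  }
  return false;
}
```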

llamabot.js

@@ -5,6 +5,8 @@ import { resetResponses, userResetMessages } from './assets/resetMessages.js';
import { errorMessages, busyResponses } from './assets/errorMessages.js';
import cpuStat from 'cpu-stat';
import os from 'os';
import smi from 'node-nvidia-smi';

import {
  Client,
@@ -154,9 +156,9 @@ client.on('messageCreate', async (message) => {
      // if we are over the discord char limit we need chunks...
      if (response.length > limit) {
        const chunks = response.match(new RegExp(`.{1,${limit}}`, "g"));
        if (chunks.length >= 15) return await message.channel.send("Response chunks too large. Try again");
        for (let i = 0; i < chunks.length; i++) {
@@ -240,7 +242,7 @@ async function generateResponse(conversation, message) {
        // Append a new line and the new content to the existing content of the last message
        conversation.messages[lastMessageIndex].content += "\n" + response;
        console.log("A URL was provided, response: " + response)
      } catch (err) {
@@ -271,10 +273,39 @@ async function generateResponse(conversation, message) {
    const totalMemory = os.totalmem() / 1024 / 1024 / 1024;
    const usedMemory = totalMemory - freeMemory;

    let filedsData = [
      {
        name: 'System Load',
        value: `${systemLoad.toFixed(2)}%`,
      },
      {
        name: 'Memory Usage',
        value: `${usedMemory.toFixed(2)} GB / ${totalMemory.toFixed(2)} GB`,
      },
      {
        name: 'Time',
        value: `~${time} seconds.`,
      },
    ]

    if (process.env.GPU == 1) {
      smi(function (err, data) {
        if (err) {
          // Handle error if smi function fails
          console.error(err);
          return;
        }

        let utilization = data.nvidia_smi_log.gpu.utilization;
        let gpuUtilization = utilization.gpu_util;
        let memoryUtilization = utilization.memory_util;
        let gpuTemp = data.nvidia_smi_log.gpu.temperature.gpu_temp;

        // These are not used until nvidia-docker fixes their support
        let gpuTarget = data.nvidia_smi_log.gpu.temperature.gpu_target_temperature;
        let gpuFanSpeed = data.nvidia_smi_log.gpu.fan_speed;

        let filedsData = [
          {
            name: 'System Load',
            value: `${systemLoad.toFixed(2)}%`,
@@ -283,21 +314,56 @@
            name: 'Memory Usage',
            value: `${usedMemory.toFixed(2)} GB / ${totalMemory.toFixed(2)} GB`,
          },
          {
            name: 'GPU Utilization',
            value: `${gpuUtilization}`,
          },
          {
            name: 'Memory Utilization',
            value: `${memoryUtilization}`,
          },
          {
            name: 'GPU Temperature',
            value: `${gpuTemp}`,
          },
          {
            name: 'Time',
            value: `~${time} seconds.`,
          },
        ];

        const embedData = {
          color: 0x0099ff,
          title: 'Please wait.. I am thinking...',
          fields: filedsData,
        };

        // if the message object doesn't exist, create it
        if (!botMessage) {
          (async () => {
            botMessage = await message.channel.send({ embeds: [embedData] });
          })();
        } else {
          botMessage.edit({ embeds: [embedData] }); // otherwise, update the message
        }
      });
    } else {
      const embedData = {
        color: 0x0099ff,
        title: 'Please wait.. I am thinking...',
        fields: filedsData,
      };

      // if the message object doesn't exist, create it
      if (!botMessage) {
        (async () => {
          botMessage = await message.channel.send({ embeds: [embedData] });
        })();
      } else {
        botMessage.edit({ embeds: [embedData] }); // otherwise, update the message
      }
    }
  });
};
@@ -344,4 +410,4 @@ async function generateResponse(conversation, message) {
  }
}

client.login(process.env.THE_TOKEN); // Replace with your bot token
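
For reference, node-nvidia-smi shells out to `nvidia-smi -q -x` and returns the parsed XML, which is where the `nvidia_smi_log.gpu.*` paths above come from. A standalone sketch of the fields the embed reads; the example values are illustrative, the readings come back as strings with units (which is why they are interpolated directly), and on multi-GPU hosts `gpu` may be an array, a case the code above does not handle:

```js
import smi from 'node-nvidia-smi';

smi(function (err, data) {
  if (err) return console.error(err);

  const gpu = data.nvidia_smi_log.gpu;       // single-GPU assumption
  console.log(gpu.utilization.gpu_util);     // e.g. "42 %"
  console.log(gpu.utilization.memory_util);  // e.g. "17 %"
  console.log(gpu.temperature.gpu_temp);     // e.g. "61 C"
  console.log(gpu.fan_speed);                // e.g. "30 %"
});
```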

package.json

@@ -15,6 +15,8 @@
    "dotenv": "^16.0.3",
    "node-fetch": "^3.3.1",
    "os": "^0.1.2",
    "cpu-stat": "^2.0.1",
    "node-nvidia-smi": "^1.0.0"
  }
}