adding NVIDIA GPU Support with Stats
This commit is contained in:
parent
4b090592ad
commit
368004f10b
17
Dockerfile.gpu
Normal file
17
Dockerfile.gpu
Normal file
@ -0,0 +1,17 @@
|
||||
FROM nvidia/cuda:12.1.1-devel-ubuntu20.04
|
||||
ENV DEBIAN_FRONTEND noninteractive
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
RUN apt update
|
||||
|
||||
RUN apt install sudo curl -y
|
||||
|
||||
RUN curl -fsSL https://deb.nodesource.com/setup_18.x | sudo -E bash - && sudo apt-get install -y nodejs
|
||||
|
||||
COPY package*.json ./
|
||||
RUN npm install --omit=dev
|
||||
|
||||
COPY . .
|
||||
|
||||
CMD node llamabot.js
|
20
README.md
20
README.md
@ -78,4 +78,24 @@ This will automatically configure the API for you as well as the bot in two sepe
|
||||
5. `docker compose up -d`
|
||||
|
||||
|
||||
# Docker Compose with GPU
|
||||
This will automatically configure the API with cuBLAS (GPU inference) support, as well as the bot, in two separate containers within a stack.
|
||||
|
||||
1. `git clone https://git.ssh.surf/snxraven/llama-cpp-python-djs-bot.git` - Clone the repo
|
||||
|
||||
2. `mv docker-compose.yml docker-compose.nogpu.yml; mv docker-compose.gpu.yml docker-compose.yml;` - Move nongpu compose out of the way, Enable GPU Support
|
||||
|
||||
3. `mv Dockerfile Dockerfile.nongpu; mv Dockerfile.gpu Dockerfile;` - Move nongpu Dockerfile out of the way, enable GPU Support
|
||||
|
||||
4. `cp default.gpu.env .env` - Copy the default GPU .env to its proper location
|
||||
|
||||
5. Set DATA_DIR in .env to the exact location of your model files.
|
||||
|
||||
6. Edit docker-compose.yml MODEL to ensure the correct model bin is set
|
||||
|
||||
7. Set N_GPU_LAYERS to the number of layers you would like to offload to the GPU
|
||||
|
||||
8. `docker compose up -d`
|
||||
|
||||
|
||||
Want to make this better? Issue a pull request!
|
||||
|
54
default.gpu.env
Normal file
54
default.gpu.env
Normal file
@ -0,0 +1,54 @@
|
||||
# Discord Token
|
||||
THE_TOKEN = "DISCORD_TOKEN_HERE"
|
||||
|
||||
# The Channel IDs the bot will operate in, separated by commas
|
||||
CHANNEL_IDS = 1094494101631680653,1094628334727614605
|
||||
|
||||
# The INIT prompt for all conversations.
|
||||
INIT_PROMPT = "Assistant name: ChatAI. You code, write and provide any information without any mistakes."
|
||||
|
||||
# Loading Embed Refresh Timing
|
||||
REFRESH_INTERVAL=10
|
||||
|
||||
# When a message is too large for Discord we chunk the response into separate messages.
|
||||
# To ensure we do not rate limit the bot we send these at a delay interval.
|
||||
# DEFAULT: 3 a good setting is between 3 and 7 seconds.
|
||||
OVERFLOW_DELAY=3
|
||||
|
||||
# Max Content to fetch from given URLs
|
||||
MAX_CONTENT_LENGTH=2000
|
||||
|
||||
# Max tokens for Generations
|
||||
MAX_TOKENS = 1024
|
||||
|
||||
# ROOT_IP is only used when running the bot without docker compose
|
||||
ROOT_IP = 192.168.0.15
|
||||
|
||||
# ROOT_PORT is only used when running the bot without docker compose
|
||||
ROOT_PORT = 8000
|
||||
|
||||
# Directory to your models (llama.cpp specific settings)
|
||||
DATA_DIR = /home/USERNAME/weights
|
||||
|
||||
# Enable Experimental Message Caches (Limited to single session)
|
||||
# Cache will use ~1.4 GB or MORE of RAM. ONLY ENABLE IF YOUR SYSTEM CAN HANDLE THIS.
|
||||
CACHE = 1
|
||||
|
||||
# Set number of threads to use, currently, a standard thread will utilize 1 whole core
|
||||
# I usually set this to the number of physical cores I have, OR 2 cores fewer to leave room for other processes.
|
||||
N_THREADS = 4
|
||||
|
||||
# Always use MMAP unless you know what you are doing
|
||||
USE_MMAP=1
|
||||
|
||||
# Only use MLOCK if you know what it does!
|
||||
USE_MLOCK=0
|
||||
|
||||
# Repeat penalty: the higher the number, the more strongly repeated text is penalized.
|
||||
REPEAT_PENALTY=1
|
||||
|
||||
# GPU SPECIFIC SETTINGS BELOW
|
||||
|
||||
GPU=1
|
||||
|
||||
N_GPU_LAYERS=32
|
34
docker-compose.gpu.yml
Normal file
34
docker-compose.gpu.yml
Normal file
@ -0,0 +1,34 @@
|
||||
version: '3.9'
|
||||
|
||||
services:
|
||||
llama-python-server:
|
||||
container_name: llama-python-gpu-server
|
||||
restart: unless-stopped
|
||||
build:
|
||||
context: ./gpu-server
|
||||
env_file: .env
|
||||
volumes:
|
||||
- ${DATA_DIR}:/usr/src/app/models
|
||||
environment:
|
||||
- HOST=llama-python-gpu-server
|
||||
- MODEL=./models/ggml-vic7b-q5_1.bin.1
|
||||
- NVIDIA_VISIBLE_DEVICES=all
|
||||
runtime: nvidia
|
||||
|
||||
llama-python-djs-bot:
|
||||
container_name: llama-python-djs-bot
|
||||
restart: unless-stopped
|
||||
build:
|
||||
context: .
|
||||
depends_on:
|
||||
- llama-python-server
|
||||
environment:
|
||||
- THE_TOKEN
|
||||
- REFRESH_INTERVAL
|
||||
- CHANNEL_IDS
|
||||
- GPU
|
||||
- ROOT_IP=llama-python-server
|
||||
- ROOT_PORT=8000
|
||||
- INIT_PROMPT='Assistant Name':' ChatAI. You code, write and provide any information without any mistakes.'
|
||||
- NVIDIA_VISIBLE_DEVICES=all
|
||||
runtime: nvidia
|
32
gpu-server/Dockerfile
Normal file
32
gpu-server/Dockerfile
Normal file
@ -0,0 +1,32 @@
|
||||
FROM nvidia/cuda:12.1.1-devel-ubuntu20.04
|
||||
|
||||
# Install the deps
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
ENV TZ=Etc/GMT
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends python3 python3-pip git cmake
|
||||
|
||||
# Get llama-cpp-python
|
||||
WORKDIR /usr/src
|
||||
|
||||
RUN git clone https://github.com/abetlen/llama-cpp-python.git
|
||||
|
||||
RUN mv llama-cpp-python app
|
||||
|
||||
WORKDIR /usr/src/app
|
||||
|
||||
#RUN git clone https://github.com/gjmulder/llama-cpp-python.git
|
||||
#RUN git checkout improved-unit-tests
|
||||
|
||||
# Patch .gitmodules to use HTTPS
|
||||
RUN sed -i 's|git@github.com:ggerganov/llama.cpp.git|https://github.com/ggerganov/llama.cpp.git|' .gitmodules
|
||||
RUN git submodule update --init --recursive
|
||||
|
||||
# Build llama-cpp-python w/CuBLAS
|
||||
RUN grep --colour "n_batch" ./llama_cpp/server/*.py
|
||||
RUN pip install scikit-build fastapi sse_starlette uvicorn && LLAMA_CUBLAS=1 python3 setup.py develop
|
||||
|
||||
# We need to set the host to 0.0.0.0 to allow outside access
|
||||
ENV HOST 0.0.0.0
|
||||
|
||||
# Run the server
|
||||
CMD python3 -m llama_cpp.server
|
76
llamabot.js
76
llamabot.js
@ -5,6 +5,8 @@ import { resetResponses, userResetMessages } from './assets/resetMessages.js';
|
||||
import { errorMessages, busyResponses } from './assets/errorMessages.js';
|
||||
import cpuStat from 'cpu-stat';
|
||||
import os from 'os';
|
||||
import smi from 'node-nvidia-smi';
|
||||
|
||||
|
||||
import {
|
||||
Client,
|
||||
@ -271,10 +273,7 @@ async function generateResponse(conversation, message) {
|
||||
const totalMemory = os.totalmem() / 1024 / 1024 / 1024;
|
||||
const usedMemory = totalMemory - freeMemory;
|
||||
|
||||
const embedData = {
|
||||
color: 0x0099ff,
|
||||
title: 'Please wait.. I am thinking...',
|
||||
fields: [
|
||||
let filedsData = [
|
||||
{
|
||||
name: 'System Load',
|
||||
value: `${systemLoad.toFixed(2)}%`,
|
||||
@ -287,7 +286,56 @@ async function generateResponse(conversation, message) {
|
||||
name: 'Time',
|
||||
value: `~${time} seconds.`,
|
||||
},
|
||||
],
|
||||
]
|
||||
|
||||
if (process.env.GPU == 1) {
|
||||
smi(function (err, data) {
|
||||
if (err) {
|
||||
// Handle error if smi function fails
|
||||
console.error(err);
|
||||
return;
|
||||
}
|
||||
|
||||
let utilization = data.nvidia_smi_log.gpu.utilization;
|
||||
let gpuUtilization = utilization.gpu_util;
|
||||
let memoryUtilization = utilization.memory_util;
|
||||
let gpuTemp = data.nvidia_smi_log.gpu.temperature.gpu_temp;
|
||||
|
||||
// These are not used until nvidia-docker fixes their support
|
||||
let gpuTarget = data.nvidia_smi_log.gpu.temperature.gpu_target_temperature;
|
||||
let gpuFanSpeed = data.nvidia_smi_log.gpu.fan_speed;
|
||||
|
||||
let filedsData = [
|
||||
{
|
||||
name: 'System Load',
|
||||
value: `${systemLoad.toFixed(2)}%`,
|
||||
},
|
||||
{
|
||||
name: 'Memory Usage',
|
||||
value: `${usedMemory.toFixed(2)} GB / ${totalMemory.toFixed(2)} GB`,
|
||||
},
|
||||
{
|
||||
name: 'GPU Utilization',
|
||||
value: `${gpuUtilization}`,
|
||||
},
|
||||
{
|
||||
name: 'Memory Utilization',
|
||||
value: `${memoryUtilization}`,
|
||||
},
|
||||
{
|
||||
name: 'GPU Temperature',
|
||||
value: `${gpuTemp}`,
|
||||
},
|
||||
{
|
||||
name: 'Time',
|
||||
value: `~${time} seconds.`,
|
||||
},
|
||||
];
|
||||
|
||||
const embedData = {
|
||||
color: 0x0099ff,
|
||||
title: 'Please wait.. I am thinking...',
|
||||
fields: filedsData,
|
||||
};
|
||||
|
||||
// if the message object doesn't exist, create it
|
||||
@ -299,6 +347,24 @@ async function generateResponse(conversation, message) {
|
||||
botMessage.edit({ embeds: [embedData] }); // otherwise, update the message
|
||||
}
|
||||
});
|
||||
} else {
|
||||
const embedData = {
|
||||
color: 0x0099ff,
|
||||
title: 'Please wait.. I am thinking...',
|
||||
fields: filedsData, // GPU stats disabled: use the CPU-only fields built above
|
||||
};
|
||||
|
||||
// if the message object doesn't exist, create it
|
||||
if (!botMessage) {
|
||||
(async () => {
|
||||
botMessage = await message.channel.send({ embeds: [embedData] });
|
||||
})();
|
||||
} else {
|
||||
botMessage.edit({ embeds: [embedData] }); // otherwise, update the message
|
||||
}
|
||||
}
|
||||
|
||||
});
|
||||
};
|
||||
|
||||
// call the function initially
|
||||
|
@ -15,6 +15,8 @@
|
||||
"dotenv": "^16.0.3",
|
||||
"node-fetch": "^3.3.1",
|
||||
"os": "^0.1.2",
|
||||
"cpu-stat": "^2.0.1"
|
||||
"cpu-stat": "^2.0.1",
|
||||
"node-nvidia-smi": "^1.0.0"
|
||||
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user