diff --git a/Dockerfile.gpu b/Dockerfile.gpu
new file mode 100644
index 0000000..824be3c
--- /dev/null
+++ b/Dockerfile.gpu
@@ -0,0 +1,17 @@
+FROM nvidia/cuda:12.1.1-devel-ubuntu20.04
+ENV DEBIAN_FRONTEND noninteractive
+
+WORKDIR /app
+
+RUN apt update
+
+RUN apt install sudo curl -y
+
+RUN curl -fsSL https://deb.nodesource.com/setup_18.x | sudo -E bash - && sudo apt-get install -y nodejs
+
+COPY package*.json ./
+RUN npm install --omit=dev
+
+COPY . .
+
+CMD node llamabot.js
\ No newline at end of file
diff --git a/README.md b/README.md
index ffffa88..00b3441 100644
--- a/README.md
+++ b/README.md
@@ -78,4 +78,24 @@ This will automatically configure the API for you as well as the bot in two sepe
 
 5. `docker compose up -d`
 
+# Docker Compose with GPU
+This will automatically configure the API with cuBLAS GPU inference support, as well as the bot, in two separate containers within a stack.
+
+1. `git clone https://git.ssh.surf/snxraven/llama-cpp-python-djs-bot.git` - Clone the repo
+
+2. `mv docker-compose.yml docker-compose.nogpu.yml; mv docker-compose.gpu.yml docker-compose.yml;` - Move the non-GPU compose file out of the way and enable GPU support
+
+3. `mv Dockerfile Dockerfile.nongpu; mv Dockerfile.gpu Dockerfile;` - Move the non-GPU Dockerfile out of the way and enable GPU support
+
+4. `cp default.gpu.env .env` - Copy the default GPU .env to its proper location
+
+5. Set DATA_DIR in .env to the exact location of your model files.
+
+6. Edit MODEL in docker-compose.yml to ensure the correct model bin is set
+
+7. Set N_GPU_LAYERS in .env to the number of layers you would like to offload to the GPU
+
+8. `docker compose up -d`
+
+
 Want to make this better? Issue a pull request!
diff --git a/default.gpu.env b/default.gpu.env
new file mode 100644
index 0000000..9abbb0d
--- /dev/null
+++ b/default.gpu.env
@@ -0,0 +1,54 @@
+# Discord Token
+THE_TOKEN = "DISCORD_TOKEN_HERE"
+
+# The Channel IDs the bot will operate in, separated by commas
+CHANNEL_IDS = 1094494101631680653,1094628334727614605
+
+# The INIT prompt for all conversations.
+INIT_PROMPT = "Assistant name: ChatAI. You code, write and provide any information without any mistakes."
+
+# Loading embed refresh timing
+REFRESH_INTERVAL=10
+
+# When a message is too large for Discord we chunk the response into separate messages.
+# To ensure we do not rate limit the bot we send these at a delay interval.
+# DEFAULT: 3. A good setting is between 3 and 7 seconds.
+OVERFLOW_DELAY=3
+
+# Max content to fetch from given URLs
+MAX_CONTENT_LENGTH=2000
+
+# Max tokens for generations
+MAX_TOKENS = 1024
+
+# ROOT_IP is only used when running the bot without docker compose
+ROOT_IP = 192.168.0.15
+
+# PORT is only used when running the bot without docker compose
+ROOT_PORT = 8000
+
+# Directory to your models (llama.cpp specific settings)
+DATA_DIR = /home/USERNAME/weights
+
+# Enable experimental message caches (limited to a single session)
+# Cache will use ~1.4 GB or MORE of RAM. ONLY ENABLE IF YOUR SYSTEM CAN HANDLE THIS.
+CACHE = 1
+
+# Set the number of threads to use; currently, a standard thread will utilize 1 whole core.
+# I usually set this to the number of cores I physically have, OR 2 cores fewer, to allow for other processes.
+N_THREADS = 4
+
+# Always use MMAP unless you know what you are doing
+USE_MMAP=1
+
+# Only use MLOCK if you know what it does!
+USE_MLOCK=0
+
+# The higher the number, the stronger the penalty against repetition.
+REPEAT_PENALTY=1
+
+# GPU SPECIFIC SETTINGS BELOW
+
+GPU=1
+
+N_GPU_LAYERS=32
\ No newline at end of file
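
Note (editorial, outside the patch): a minimal sketch of how the bot side can consume these settings with dotenv, which is already a dependency. The exact parsing in llamabot.js may differ, and N_GPU_LAYERS is presumably read by the llama-cpp-python server container rather than by the bot itself.

// hypothetical sketch, not part of the repo; assumes an ESM setup like llamabot.js
import 'dotenv/config';

// CHANNEL_IDS is a comma-separated list, e.g. "1094494101631680653,1094628334727614605"
const channelIds = (process.env.CHANNEL_IDS || '')
  .split(',')
  .map((id) => id.trim())
  .filter(Boolean);

// GPU=1 switches llamabot.js onto the node-nvidia-smi stats path added further below
const gpuEnabled = process.env.GPU == 1;

console.log({ channelIds, gpuEnabled });
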
diff --git a/docker-compose.gpu.yml b/docker-compose.gpu.yml
new file mode 100644
index 0000000..30470e2
--- /dev/null
+++ b/docker-compose.gpu.yml
@@ -0,0 +1,34 @@
+version: '3.9'
+
+services:
+  llama-python-server:
+    container_name: llama-python-gpu-server
+    restart: unless-stopped
+    build:
+      context: ./gpu-server
+    env_file: .env
+    volumes:
+      - ${DATA_DIR}:/usr/src/app/models
+    environment:
+      - HOST=llama-python-gpu-server
+      - MODEL=./models/ggml-vic7b-q5_1.bin.1
+      - NVIDIA_VISIBLE_DEVICES=all
+    runtime: nvidia
+
+  llama-python-djs-bot:
+    container_name: llama-python-djs-bot
+    restart: unless-stopped
+    build:
+      context: .
+    depends_on:
+      - llama-python-server
+    environment:
+      - THE_TOKEN
+      - REFRESH_INTERVAL
+      - CHANNEL_IDS
+      - GPU
+      - ROOT_IP=llama-python-server
+      - ROOT_PORT=8000
+      - INIT_PROMPT='Assistant Name':' ChatAI. You code, write and provide any information without any mistakes.'
+      - NVIDIA_VISIBLE_DEVICES=all
+    runtime: nvidia
diff --git a/gpu-server/Dockerfile b/gpu-server/Dockerfile
new file mode 100644
index 0000000..1c3604d
--- /dev/null
+++ b/gpu-server/Dockerfile
@@ -0,0 +1,32 @@
+FROM nvidia/cuda:12.1.1-devel-ubuntu20.04
+
+# Install the deps
+ENV DEBIAN_FRONTEND=noninteractive
+ENV TZ=Etc/GMT
+RUN apt-get update && apt-get install -y --no-install-recommends python3 python3-pip git cmake
+
+# Get llama-cpp-python
+WORKDIR /usr/src
+
+RUN git clone https://github.com/abetlen/llama-cpp-python.git
+
+RUN mv llama-cpp-python app
+
+WORKDIR /usr/src/app
+
+#RUN git clone https://github.com/gjmulder/llama-cpp-python.git
+#RUN git checkout improved-unit-tests
+
+# Patch .gitmodules to use HTTPS
+RUN sed -i 's|git@github.com:ggerganov/llama.cpp.git|https://github.com/ggerganov/llama.cpp.git|' .gitmodules
+RUN git submodule update --init --recursive
+
+# Build llama-cpp-python w/cuBLAS
+RUN grep --colour "n_batch" ./llama_cpp/server/*.py
+RUN pip install scikit-build fastapi sse_starlette uvicorn && LLAMA_CUBLAS=1 python3 setup.py develop
+
+# We need to set the host to 0.0.0.0 to allow outside access
+ENV HOST 0.0.0.0
+
+# Run the server
+CMD python3 -m llama_cpp.server
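
Note (editorial, outside the patch): once the stack is up, a quick way to sanity-check the GPU server is a direct request from the bot container. This assumes the OpenAI-style /v1/chat/completions route exposed by llama-cpp-python's server; swap the host and port for ROOT_IP/ROOT_PORT when running outside the compose network.

// hypothetical helper, not part of the repo
import fetch from 'node-fetch';

const res = await fetch('http://llama-python-server:8000/v1/chat/completions', {
  method: 'POST',
  headers: { 'Content-Type': 'application/json' },
  body: JSON.stringify({
    messages: [{ role: 'user', content: 'Say hello.' }],
    max_tokens: 32,
  }),
});
console.log(await res.json());
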
diff --git a/llamabot.js b/llamabot.js
index 893d7cb..f664933 100644
--- a/llamabot.js
+++ b/llamabot.js
@@ -5,6 +5,8 @@ import { resetResponses, userResetMessages } from './assets/resetMessages.js';
 import { errorMessages, busyResponses } from './assets/errorMessages.js';
 import cpuStat from 'cpu-stat';
 import os from 'os';
+import smi from 'node-nvidia-smi';
+
 
 import {
   Client,
@@ -154,9 +156,9 @@ client.on('messageCreate', async (message) => {
 
   // if we are over the discord char limit we need chunks...
   if (response.length > limit) {
-
+
     const chunks = response.match(new RegExp(`.{1,${limit}}`, "g"));
-    if (chunks.length >= 15) return await message.channel.send("Response chunks too large. Try again");
+    if (chunks.length >= 15) return await message.channel.send("Response chunks too large. Try again");
 
     for (let i = 0; i < chunks.length; i++) {
 
@@ -240,7 +242,7 @@ async function generateResponse(conversation, message) {
 
       // Append a new line and the new content to the existing content of the last message
      conversation.messages[lastMessageIndex].content += "\n" + response;
-
+
      console.log("A URL was provided, response: " + response)
 
     } catch (err) {
@@ -271,10 +273,39 @@ async function generateResponse(conversation, message) {
     const totalMemory = os.totalmem() / 1024 / 1024 / 1024;
     const usedMemory = totalMemory - freeMemory;
 
-    const embedData = {
-      color: 0x0099ff,
-      title: 'Please wait.. I am thinking...',
-      fields: [
+    let fieldsData = [
+      {
+        name: 'System Load',
+        value: `${systemLoad.toFixed(2)}%`,
+      },
+      {
+        name: 'Memory Usage',
+        value: `${usedMemory.toFixed(2)} GB / ${totalMemory.toFixed(2)} GB`,
+      },
+      {
+        name: 'Time',
+        value: `~${time} seconds.`,
+      },
+    ]
+
+    if (process.env.GPU == 1) {
+      smi(function (err, data) {
+        if (err) {
+          // Handle error if smi function fails
+          console.error(err);
+          return;
+        }
+
+        let utilization = data.nvidia_smi_log.gpu.utilization;
+        let gpuUtilization = utilization.gpu_util;
+        let memoryUtilization = utilization.memory_util;
+        let gpuTemp = data.nvidia_smi_log.gpu.temperature.gpu_temp;
+
+        // These are not used until nvidia-docker fixes their support
+        let gpuTarget = data.nvidia_smi_log.gpu.temperature.gpu_target_temperature;
+        let gpuFanSpeed = data.nvidia_smi_log.gpu.fan_speed;
+
+        let fieldsData = [
        {
          name: 'System Load',
          value: `${systemLoad.toFixed(2)}%`,
@@ -283,21 +314,56 @@
          name: 'Memory Usage',
          value: `${usedMemory.toFixed(2)} GB / ${totalMemory.toFixed(2)} GB`,
        },
+        {
+          name: 'GPU Utilization',
+          value: `${gpuUtilization}`,
+        },
+        {
+          name: 'Memory Utilization',
+          value: `${memoryUtilization}`,
+        },
+        {
+          name: 'GPU Temperature',
+          value: `${gpuTemp}`,
+        },
        {
          name: 'Time',
          value: `~${time} seconds.`,
        },
-      ],
-    };
-
-    // if the message object doesn't exist, create it
-    if (!botMessage) {
-      (async () => {
+        ];
+
+        const embedData = {
+          color: 0x0099ff,
+          title: 'Please wait.. I am thinking...',
+          fields: fieldsData,
+        };
+
+        // if the message object doesn't exist, create it
+        if (!botMessage) {
+          (async () => {
+            botMessage = await message.channel.send({ embeds: [embedData] });
+          })();
+        } else {
+          botMessage.edit({ embeds: [embedData] }); // otherwise, update the message
+        }
+      });
+    } else {
+      const embedData = {
+        color: 0x0099ff,
+        title: 'Please wait.. I am thinking...',
+        fields: fieldsData,
+      };
+
+      // if the message object doesn't exist, create it
+      if (!botMessage) {
+        (async () => {
        botMessage = await message.channel.send({ embeds: [embedData] });
-      })();
-    } else {
-      botMessage.edit({ embeds: [embedData] }); // otherwise, update the message
-    }
+        })();
+      } else {
+        botMessage.edit({ embeds: [embedData] }); // otherwise, update the message
+      }
+    }
+
   });
 };
@@ -344,4 +410,4 @@
   }
 }
 
-client.login(process.env.THE_TOKEN); // Replace with your bot token
\ No newline at end of file
+client.login(process.env.THE_TOKEN); // Replace with your bot token
diff --git a/package.json b/package.json
index b6caad4..81438c9 100644
--- a/package.json
+++ b/package.json
@@ -15,6 +15,8 @@
     "dotenv": "^16.0.3",
     "node-fetch": "^3.3.1",
     "os": "^0.1.2",
-    "cpu-stat": "^2.0.1"
+    "cpu-stat": "^2.0.1",
+    "node-nvidia-smi": "^1.0.0"
+
   }
 }
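
Note (editorial, outside the patch): the GPU and non-GPU branches added to llamabot.js above duplicate the embed-building logic. A possible follow-up refactor, using the same node-nvidia-smi response shape the patch already relies on (data.nvidia_smi_log.gpu.*), is to collect the fields first and render the embed once:

// hypothetical refactor sketch, not part of the patch
import smi from 'node-nvidia-smi';

// Build the base status fields, optionally append GPU stats, then hand the
// finished array to a single callback that sends or edits the embed.
function buildStatusFields(systemLoad, usedMemory, totalMemory, time, done) {
  const fields = [
    { name: 'System Load', value: `${systemLoad.toFixed(2)}%` },
    { name: 'Memory Usage', value: `${usedMemory.toFixed(2)} GB / ${totalMemory.toFixed(2)} GB` },
    { name: 'Time', value: `~${time} seconds.` },
  ];

  if (process.env.GPU != 1) return done(fields);

  smi((err, data) => {
    if (err) {
      console.error(err); // fall back to CPU-only fields if nvidia-smi is unavailable
      return done(fields);
    }
    const gpu = data.nvidia_smi_log.gpu;
    fields.splice(2, 0, // insert GPU stats before the Time field
      { name: 'GPU Utilization', value: `${gpu.utilization.gpu_util}` },
      { name: 'Memory Utilization', value: `${gpu.utilization.memory_util}` },
      { name: 'GPU Temperature', value: `${gpu.temperature.gpu_temp}` },
    );
    done(fields);
  });
}

The single callback keeps one send-or-edit path for the embed instead of two nearly identical branches.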