From 368004f10b4282ebdae20156449030348ba15301 Mon Sep 17 00:00:00 2001
From: Raven Scott
Date: Fri, 19 May 2023 21:32:21 +0200
Subject: [PATCH] adding NVIDIA GPU Support with Stats

---
 Dockerfile.gpu         |  17 +++++++
 README.md              |  20 ++++++++
 default.gpu.env        |  54 ++++++++++++++++++++++
 docker-compose.gpu.yml |  34 ++++++++++++++
 gpu-server/Dockerfile  |  32 +++++++++++++
 llamabot.js            | 102 +++++++++++++++++++++++++++++++++--------
 package.json           |   4 +-
 7 files changed, 244 insertions(+), 19 deletions(-)
 create mode 100644 Dockerfile.gpu
 create mode 100644 default.gpu.env
 create mode 100644 docker-compose.gpu.yml
 create mode 100644 gpu-server/Dockerfile

diff --git a/Dockerfile.gpu b/Dockerfile.gpu
new file mode 100644
index 0000000..824be3c
--- /dev/null
+++ b/Dockerfile.gpu
@@ -0,0 +1,17 @@
+FROM nvidia/cuda:12.1.1-devel-ubuntu20.04
+ENV DEBIAN_FRONTEND noninteractive
+
+WORKDIR /app
+
+RUN apt update
+
+RUN apt install sudo curl -y
+
+RUN curl -fsSL https://deb.nodesource.com/setup_18.x | sudo -E bash - && sudo apt-get install -y nodejs
+
+COPY package*.json ./
+RUN npm install --omit=dev
+
+COPY . .
+
+CMD node llamabot.js
\ No newline at end of file
diff --git a/README.md b/README.md
index ffffa88..00b3441 100644
--- a/README.md
+++ b/README.md
@@ -78,4 +78,24 @@ This will automatically configure the API for you as well as the bot in two sepe
 5. `docker compose up -d`
 
+# Docker Compose with GPU
+This will automatically configure the API with cuBLAS GPU inference support, as well as the bot, in two separate containers within a stack.
+
+1. `git clone https://git.ssh.surf/snxraven/llama-cpp-python-djs-bot.git` - Clone the repo
+
+2. `mv docker-compose.yml docker-compose.nogpu.yml; mv docker-compose.gpu.yml docker-compose.yml;` - Move the non-GPU compose file out of the way and enable GPU support
+
+3. `mv Dockerfile Dockerfile.nongpu; mv Dockerfile.gpu Dockerfile;` - Move the non-GPU Dockerfile out of the way and enable GPU support
+
+4. `cp default.gpu.env .env` - Copy the default GPU .env to its proper location
+
+5. Set DATA_DIR in .env to the exact location of your model files.
+
+6. Edit MODEL in docker-compose.yml to ensure the correct model bin is set
+
+7. Set N_GPU_LAYERS in .env to the number of layers you would like to offload to the GPU
+
+8. `docker compose up -d`
+
+
 Want to make this better? Issue a pull request!
diff --git a/default.gpu.env b/default.gpu.env
new file mode 100644
index 0000000..9abbb0d
--- /dev/null
+++ b/default.gpu.env
@@ -0,0 +1,54 @@
+# Discord Token
+THE_TOKEN = "DISCORD_TOKEN_HERE"
+
+# The Channel IDs the bot will operate in, separated by commas
+CHANNEL_IDS = 1094494101631680653,1094628334727614605
+
+# The INIT prompt for all conversations.
+INIT_PROMPT = "Assistant name: ChatAI. You code, write and provide any information without any mistakes."
+
+# Loading Embed Refresh Timing
+REFRESH_INTERVAL=10
+
+# When a message is too large for Discord we chunk the response into separate messages.
+# To ensure we do not rate limit the bot we send these at a delay interval.
+# DEFAULT: 3 - a good setting is between 3 and 7 seconds.
+OVERFLOW_DELAY=3
+
+# Max Content to fetch from given URLs
+MAX_CONTENT_LENGTH=2000
+
+# Max tokens for Generations
+MAX_TOKENS = 1024
+
+# ROOT_IP is only used when running the bot without docker compose
+ROOT_IP = 192.168.0.15
+
+# ROOT_PORT is only used when running the bot without docker compose
+ROOT_PORT = 8000
+
+# Directory to your models (llama.cpp specific settings)
+DATA_DIR = /home/USERNAME/weights
+
+# Enable Experimental Message Caches (Limited to single session)
+# Cache will use ~1.4 GB or MORE of RAM. ONLY ENABLE IF YOUR SYSTEM CAN HANDLE THIS.
+CACHE = 1
+
+# Set the number of threads to use; currently, a standard thread will utilize 1 whole core
+# I usually set this to the number of cores I physically have, OR 2 cores fewer, to leave room for other processes.
+N_THREADS = 4
+
+# Always use MMAP unless you know what you are doing
+USE_MMAP=1
+
+# Only use MLOCK if you know what it does!
+USE_MLOCK=0
+
+# The higher the number, the stronger the repeat penalty.
+REPEAT_PENALTY=1
+
+# GPU SPECIFIC SETTINGS BELOW
+
+GPU=1
+
+N_GPU_LAYERS=32
\ No newline at end of file
diff --git a/docker-compose.gpu.yml b/docker-compose.gpu.yml
new file mode 100644
index 0000000..30470e2
--- /dev/null
+++ b/docker-compose.gpu.yml
@@ -0,0 +1,34 @@
+version: '3.9'
+
+services:
+  llama-python-server:
+    container_name: llama-python-gpu-server
+    restart: unless-stopped
+    build:
+      context: ./gpu-server
+    env_file: .env
+    volumes:
+      - ${DATA_DIR}:/usr/src/app/models
+    environment:
+      - HOST=llama-python-gpu-server
+      - MODEL=./models/ggml-vic7b-q5_1.bin.1
+      - NVIDIA_VISIBLE_DEVICES=all
+    runtime: nvidia
+
+  llama-python-djs-bot:
+    container_name: llama-python-djs-bot
+    restart: unless-stopped
+    build:
+      context: .
+    depends_on:
+      - llama-python-server
+    environment:
+      - THE_TOKEN
+      - REFRESH_INTERVAL
+      - CHANNEL_IDS
+      - GPU
+      - ROOT_IP=llama-python-server
+      - ROOT_PORT=8000
+      - INIT_PROMPT='Assistant Name':' ChatAI. You code, write and provide any information without any mistakes.'
+      - NVIDIA_VISIBLE_DEVICES=all
+    runtime: nvidia
diff --git a/gpu-server/Dockerfile b/gpu-server/Dockerfile
new file mode 100644
index 0000000..1c3604d
--- /dev/null
+++ b/gpu-server/Dockerfile
@@ -0,0 +1,32 @@
+FROM nvidia/cuda:12.1.1-devel-ubuntu20.04
+
+# Install the deps
+ENV DEBIAN_FRONTEND=noninteractive
+ENV TZ=Etc/GMT
+RUN apt-get update && apt-get install -y --no-install-recommends python3 python3-pip git cmake
+
+# Get llama-cpp-python
+WORKDIR /usr/src
+
+RUN git clone https://github.com/abetlen/llama-cpp-python.git
+
+RUN mv llama-cpp-python app
+
+WORKDIR /usr/src/app
+
+#RUN git clone https://github.com/gjmulder/llama-cpp-python.git
+#RUN git checkout improved-unit-tests
+
+# Patch .gitmodules to use HTTPS
+RUN sed -i 's|git@github.com:ggerganov/llama.cpp.git|https://github.com/ggerganov/llama.cpp.git|' .gitmodules
+RUN git submodule update --init --recursive
+
+# Build llama-cpp-python w/CuBLAS
+RUN grep --colour "n_batch" ./llama_cpp/server/*.py
+RUN pip install scikit-build fastapi sse_starlette uvicorn && LLAMA_CUBLAS=1 python3 setup.py develop
+
+# We need to set the host to 0.0.0.0 to allow outside access
+ENV HOST 0.0.0.0
+
+# Run the server
+CMD python3 -m llama_cpp.server
diff --git a/llamabot.js b/llamabot.js
index 893d7cb..f664933 100644
--- a/llamabot.js
+++ b/llamabot.js
@@ -5,6 +5,8 @@
 import { resetResponses, userResetMessages } from './assets/resetMessages.js';
 import { errorMessages, busyResponses } from './assets/errorMessages.js';
 import cpuStat from 'cpu-stat';
 import os from 'os';
+import smi from 'node-nvidia-smi';
+
 import {
   Client,
@@ -154,9 +156,9 @@ client.on('messageCreate', async (message) => {
 
      // if we are over the discord char limit we need chunks...
      if (response.length > limit) {
-
+
        const chunks = response.match(new RegExp(`.{1,${limit}}`, "g"));
-       if (chunks.length >= 15) return await message.channel.send("Response chunks too large. Try again");
+       if (chunks.length >= 15) return await message.channel.send("Response chunks too large. Try again");
 
        for (let i = 0; i < chunks.length; i++) {
 
@@ -240,7 +242,7 @@ async function generateResponse(conversation, message) {
 
        // Append a new line and the new content to the existing content of the last message
        conversation.messages[lastMessageIndex].content += "\n" + response;
-
+
        console.log("A URL was provided, response: " + response)
 
      } catch (err) {
@@ -271,10 +273,39 @@ async function generateResponse(conversation, message) {
      const totalMemory = os.totalmem() / 1024 / 1024 / 1024;
      const usedMemory = totalMemory - freeMemory;
 
-      const embedData = {
-        color: 0x0099ff,
-        title: 'Please wait.. I am thinking...',
-        fields: [
+      let fieldsData = [
+        {
+          name: 'System Load',
+          value: `${systemLoad.toFixed(2)}%`,
+        },
+        {
+          name: 'Memory Usage',
+          value: `${usedMemory.toFixed(2)} GB / ${totalMemory.toFixed(2)} GB`,
+        },
+        {
+          name: 'Time',
+          value: `~${time} seconds.`,
+        },
+      ]
+
+      if (process.env.GPU == 1) {
+        smi(function (err, data) {
+          if (err) {
+            // Handle error if smi function fails
+            console.error(err);
+            return;
+          }
+
+          let utilization = data.nvidia_smi_log.gpu.utilization;
+          let gpuUtilization = utilization.gpu_util;
+          let memoryUtilization = utilization.memory_util;
+          let gpuTemp = data.nvidia_smi_log.gpu.temperature.gpu_temp;
+
+          // These are not used until nvidia-docker fixes their support
+          let gpuTarget = data.nvidia_smi_log.gpu.temperature.gpu_target_temperature;
+          let gpuFanSpeed = data.nvidia_smi_log.gpu.fan_speed;
+
+          let fieldsData = [
        {
          name: 'System Load',
          value: `${systemLoad.toFixed(2)}%`,
@@ -283,21 +314,56 @@ async function generateResponse(conversation, message) {
          name: 'Memory Usage',
          value: `${usedMemory.toFixed(2)} GB / ${totalMemory.toFixed(2)} GB`,
        },
+            {
+              name: 'GPU Utilization',
+              value: `${gpuUtilization}`,
+            },
+            {
+              name: 'Memory Utilization',
+              value: `${memoryUtilization}`,
+            },
+            {
+              name: 'GPU Temperature',
+              value: `${gpuTemp}`,
+            },
        {
          name: 'Time',
          value: `~${time} seconds.`,
        },
-        ],
-      };
-
-      // if the message object doesn't exist, create it
-      if (!botMessage) {
-        (async () => {
+          ];
+
+          const embedData = {
+            color: 0x0099ff,
+            title: 'Please wait.. I am thinking...',
+            fields: fieldsData,
+          };
+
+          // if the message object doesn't exist, create it
+          if (!botMessage) {
+            (async () => {
+              botMessage = await message.channel.send({ embeds: [embedData] });
+            })();
+          } else {
+            botMessage.edit({ embeds: [embedData] }); // otherwise, update the message
+          }
+        });
+      } else {
+        const embedData = {
+          color: 0x0099ff,
+          title: 'Please wait.. I am thinking...',
+          fields: fieldsData,
+        };
+
+        // if the message object doesn't exist, create it
+        if (!botMessage) {
+          (async () => {
            botMessage = await message.channel.send({ embeds: [embedData] });
-        })();
-      } else {
-        botMessage.edit({ embeds: [embedData] }); // otherwise, update the message
-      }
+          })();
+        } else {
+          botMessage.edit({ embeds: [embedData] }); // otherwise, update the message
+        }
+      }
+    });
 
 
 };
@@ -344,4 +410,4 @@ async function generateResponse(conversation, message) {
 
   }
 }
-client.login(process.env.THE_TOKEN); // Replace with your bot token
\ No newline at end of file
+client.login(process.env.THE_TOKEN); // Replace with your bot token
diff --git a/package.json b/package.json
index b6caad4..81438c9 100644
--- a/package.json
+++ b/package.json
@@ -15,6 +15,8 @@
     "dotenv": "^16.0.3",
     "node-fetch": "^3.3.1",
     "os": "^0.1.2",
-    "cpu-stat": "^2.0.1"
+    "cpu-stat": "^2.0.1",
+    "node-nvidia-smi": "^1.0.0"
+
   }
 }
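
For reference, and not part of the patch above: a minimal sketch of how the node-nvidia-smi dependency added in package.json can be exercised on its own. The callback shape and the nvidia_smi_log field paths mirror what llamabot.js reads for the embed stats; the file name gpu-stats-check.js is hypothetical, and a single GPU is assumed, as in the patch.

// gpu-stats-check.js (hypothetical name) - prints the stats llamabot.js embeds when GPU=1
import smi from 'node-nvidia-smi';

smi(function (err, data) {
  if (err) {
    // nvidia-smi is unavailable (e.g. no NVIDIA driver or runtime exposed to the container)
    console.error(err);
    return;
  }

  // Assumes a single GPU, as the patch does
  const gpu = data.nvidia_smi_log.gpu;

  console.log('GPU Utilization:', gpu.utilization.gpu_util);
  console.log('Memory Utilization:', gpu.utilization.memory_util);
  console.log('GPU Temperature:', gpu.temperature.gpu_temp);
});

Running this inside the bot container (node gpu-stats-check.js) is a quick way to confirm the NVIDIA runtime is actually reachable before relying on the GPU fields in the Discord embed.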