adding NVIDIA GPU Support with Stats
commit 368004f10b
parent 4b090592ad
Dockerfile.gpu (new file, +17 lines)
@@ -0,0 +1,17 @@
FROM nvidia/cuda:12.1.1-devel-ubuntu20.04
ENV DEBIAN_FRONTEND noninteractive

WORKDIR /app

RUN apt update

RUN apt install sudo curl -y

RUN curl -fsSL https://deb.nodesource.com/setup_18.x | sudo -E bash - && sudo apt-get install -y nodejs

COPY package*.json ./
RUN npm install --omit=dev

COPY . .

CMD node llamabot.js
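Note (not part of this commit): a quick way to sanity-check this Dockerfile on its own, before wiring it into Compose, is a manual build and run. The image tag below is just an example, and --gpus all assumes the NVIDIA Container Toolkit is installed on the host.

  docker build -f Dockerfile.gpu -t llamabot-gpu .
  docker run --rm --gpus all --env-file .env llamabot-gpu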
README.md (+20 lines)
@@ -78,4 +78,24 @@ This will automatically configure the API for you as well as the bot in two sepe
5. `docker compose up -d`


# Docker Compose with GPU

This will automatically configure the API that supports cuBLAS and GPU inference for you, as well as the bot, in two separate containers within a stack.

1. `git clone https://git.ssh.surf/snxraven/llama-cpp-python-djs-bot.git` - Clone the repo

2. `mv docker-compose.yml docker-compose.nogpu.yml; mv docker-compose.gpu.yml docker-compose.yml;` - Move the non-GPU compose file out of the way and enable GPU support

3. `mv Dockerfile Dockerfile.nongpu; mv Dockerfile.gpu Dockerfile;` - Move the non-GPU Dockerfile out of the way and enable GPU support

4. `cp default.gpu.env .env` - Copy the default GPU .env to its proper location

5. Set DATA_DIR in .env to the exact location of your model files.

6. Edit MODEL in docker-compose.yml to ensure the correct model bin is set.

7. Set N_GPU_LAYERS to the number of layers you would like to offload to the GPU.

8. `docker compose up -d`

Want to make this better? Issue a pull request!
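Note (not part of this commit): after step 8, one quick check that the server container actually sees the GPU is to run nvidia-smi inside it; the service name below is taken from docker-compose.gpu.yml in this commit.

  docker compose exec llama-python-server nvidia-smi

If no GPU is listed, the missing piece is usually the NVIDIA runtime on the host (see the note after docker-compose.gpu.yml below).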
default.gpu.env (new file, +54 lines)
@@ -0,0 +1,54 @@
# Discord Token
THE_TOKEN = "DISCORD_TOKEN_HERE"

# The Channel IDs the bot will operate in, separated by commas
CHANNEL_IDS = 1094494101631680653,1094628334727614605

# The INIT prompt for all conversations.
INIT_PROMPT = "Assistant name: ChatAI. You code, write and provide any information without any mistakes."

# Loading Embed Refresh Timing
REFRESH_INTERVAL=10

# When a message is too large for Discord we chunk the response into separate messages.
# To ensure we do not rate limit the bot we send these at a delay interval.
# DEFAULT: 3; a good setting is between 3 and 7 seconds.
OVERFLOW_DELAY=3

# Max Content to fetch from given URLs
MAX_CONTENT_LENGTH=2000

# Max tokens for Generations
MAX_TOKENS = 1024

# ROOT_IP is only used when running the bot without docker compose
ROOT_IP = 192.168.0.15

# ROOT_PORT is only used when running the bot without docker compose
ROOT_PORT = 8000

# Directory to your models (llama.cpp specific settings)
DATA_DIR = /home/USERNAME/weights

# Enable Experimental Message Caches (Limited to single session)
# Cache will use ~1.4 GB or MORE of RAM. ONLY ENABLE IF YOUR SYSTEM CAN HANDLE THIS.
CACHE = 1

# Set the number of threads to use; currently, a standard thread will utilize 1 whole core.
# I usually set this to all physical cores, or 2 cores fewer to leave room for other processes.
N_THREADS = 4

# Always use MMAP unless you know what you are doing
USE_MMAP=1

# Only use MLOCK if you know what it does!
USE_MLOCK=0

# The higher the number, the more strongly repeated tokens are penalized.
REPEAT_PENALTY=1

# GPU SPECIFIC SETTINGS BELOW

GPU=1

N_GPU_LAYERS=32
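Note (not part of this commit): a sensible N_GPU_LAYERS depends on the model size and on free VRAM, so it helps to check the card before raising it:

  nvidia-smi --query-gpu=name,memory.total,memory.free --format=csv

If the server runs out of memory while loading the model, lower N_GPU_LAYERS; 32 offloads roughly every layer of a 7B vicuna-class model such as the ggml-vic7b bin referenced in docker-compose.gpu.yml.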
docker-compose.gpu.yml (new file, +34 lines)
@@ -0,0 +1,34 @@
version: '3.9'

services:
  llama-python-server:
    container_name: llama-python-gpu-server
    restart: unless-stopped
    build:
      context: ./gpu-server
    env_file: .env
    volumes:
      - ${DATA_DIR}:/usr/src/app/models
    environment:
      - HOST=llama-python-gpu-server
      - MODEL=./models/ggml-vic7b-q5_1.bin.1
      - NVIDIA_VISIBLE_DEVICES=all
    runtime: nvidia

  llama-python-djs-bot:
    container_name: llama-python-djs-bot
    restart: unless-stopped
    build:
      context: .
    depends_on:
      - llama-python-server
    environment:
      - THE_TOKEN
      - REFRESH_INTERVAL
      - CHANNEL_IDS
      - GPU
      - ROOT_IP=llama-python-server
      - ROOT_PORT=8000
      - INIT_PROMPT='Assistant Name':' ChatAI. You code, write and provide any information without any mistakes.'
      - NVIDIA_VISIBLE_DEVICES=all
    runtime: nvidia
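Note (not part of this commit): the runtime: nvidia entries assume the NVIDIA Container Toolkit is installed and its runtime is registered with the Docker daemon; without it the stack will not start. A quick host-side check:

  docker info | grep -i runtimes

If nvidia is not listed, install nvidia-container-toolkit from NVIDIA's repository and restart the Docker daemon before running docker compose up -d.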
gpu-server/Dockerfile (new file, +32 lines)
@@ -0,0 +1,32 @@
FROM nvidia/cuda:12.1.1-devel-ubuntu20.04

# Install the deps
ENV DEBIAN_FRONTEND=noninteractive
ENV TZ=Etc/GMT
RUN apt-get update && apt-get install -y --no-install-recommends python3 python3-pip git cmake

# Get llama-cpp-python
WORKDIR /usr/src

RUN git clone https://github.com/abetlen/llama-cpp-python.git

RUN mv llama-cpp-python app

WORKDIR /usr/src/app

#RUN git clone https://github.com/gjmulder/llama-cpp-python.git
#RUN git checkout improved-unit-tests

# Patch .gitmodules to use HTTPS
RUN sed -i 's|git@github.com:ggerganov/llama.cpp.git|https://github.com/ggerganov/llama.cpp.git|' .gitmodules
RUN git submodule update --init --recursive

# Build llama-cpp-python w/CuBLAS
RUN grep --colour "n_batch" ./llama_cpp/server/*.py
RUN pip install scikit-build fastapi sse_starlette uvicorn && LLAMA_CUBLAS=1 python3 setup.py develop

# We need to set the host to 0.0.0.0 to allow outside access
ENV HOST 0.0.0.0

# Run the server
CMD python3 -m llama_cpp.server
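Note (not part of this commit): once this container is up, llama_cpp.server listens on port 8000 with HOST set to 0.0.0.0, and llama-cpp-python exposes an OpenAI-compatible REST API. The compose file does not publish that port to the host, so the request below assumes either a temporary ports: "8000:8000" mapping on the server service or running it from inside the stack's network; the prompt and max_tokens are arbitrary.

  curl -s http://localhost:8000/v1/completions -H 'Content-Type: application/json' -d '{"prompt": "Hello", "max_tokens": 16}'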
llamabot.js (76 lines changed)
@@ -5,6 +5,8 @@ import { resetResponses, userResetMessages } from './assets/resetMessages.js';
 import { errorMessages, busyResponses } from './assets/errorMessages.js';
 import cpuStat from 'cpu-stat';
 import os from 'os';
+import smi from 'node-nvidia-smi';
+

 import {
   Client,

@@ -271,10 +273,7 @@ async function generateResponse(conversation, message) {
   const totalMemory = os.totalmem() / 1024 / 1024 / 1024;
   const usedMemory = totalMemory - freeMemory;

-  const embedData = {
-    color: 0x0099ff,
-    title: 'Please wait.. I am thinking...',
-    fields: [
+  let filedsData = [
     {
       name: 'System Load',
       value: `${systemLoad.toFixed(2)}%`,

@@ -287,7 +286,56 @@ async function generateResponse(conversation, message) {
       name: 'Time',
       value: `~${time} seconds.`,
     },
-  ],
+  ]
+
+  if (process.env.GPU == 1) {
+    smi(function (err, data) {
+      if (err) {
+        // Handle error if smi function fails
+        console.error(err);
+        return;
+      }
+
+      let utilization = data.nvidia_smi_log.gpu.utilization;
+      let gpuUtilization = utilization.gpu_util;
+      let memoryUtilization = utilization.memory_util;
+      let gpuTemp = data.nvidia_smi_log.gpu.temperature.gpu_temp;
+
+      // These are not used until nvidia-docker fixes their support
+      let gpuTarget = data.nvidia_smi_log.gpu.temperature.gpu_target_temperature;
+      let gpuFanSpeed = data.nvidia_smi_log.gpu.fan_speed;
+
+      let filedsData = [
+        {
+          name: 'System Load',
+          value: `${systemLoad.toFixed(2)}%`,
+        },
+        {
+          name: 'Memory Usage',
+          value: `${usedMemory.toFixed(2)} GB / ${totalMemory.toFixed(2)} GB`,
+        },
+        {
+          name: 'GPU Utilization',
+          value: `${gpuUtilization}`,
+        },
+        {
+          name: 'Memory Utilization',
+          value: `${memoryUtilization}`,
+        },
+        {
+          name: 'GPU Temperature',
+          value: `${gpuTemp}`,
+        },
+        {
+          name: 'Time',
+          value: `~${time} seconds.`,
+        },
+      ];
+
+      const embedData = {
+        color: 0x0099ff,
+        title: 'Please wait.. I am thinking...',
+        fields: filedsData,
       };

       // if the message object doesn't exist, create it

@@ -299,6 +347,24 @@ async function generateResponse(conversation, message) {
       botMessage.edit({ embeds: [embedData] }); // otherwise, update the message
     }
   });
+  } else {
+    const embedData = {
+      color: 0x0099ff,
+      title: 'Please wait.. I am thinking...',
+      fields: filedsData, // note: "filedsData" (sic) is the CPU-only fields array built above
+    };
+
+    // if the message object doesn't exist, create it
+    if (!botMessage) {
+      (async () => {
+        botMessage = await message.channel.send({ embeds: [embedData] });
+      })();
+    } else {
+      botMessage.edit({ embeds: [embedData] }); // otherwise, update the message
+    }
+  }
+
+  });
 };

 // call the function initially
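Note (not part of this commit): node-nvidia-smi appears to be a thin wrapper around nvidia-smi's XML query mode, which is why the stats above are read from data.nvidia_smi_log.gpu.*. The same structure can be inspected by hand with:

  nvidia-smi -q -x

whose <nvidia_smi_log><gpu> output contains the utilization.gpu_util, utilization.memory_util and temperature.gpu_temp fields used for the embed.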
package.json
@@ -15,6 +15,8 @@
     "dotenv": "^16.0.3",
     "node-fetch": "^3.3.1",
     "os": "^0.1.2",
-    "cpu-stat": "^2.0.1"
+    "cpu-stat": "^2.0.1",
+    "node-nvidia-smi": "^1.0.0"
+
   }
 }
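Note (not part of this commit): when running the bot outside Docker, the new dependency needs to be installed before the GPU stats path will load:

  npm install node-nvidia-smi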