INIT_PROMPT="You are an assitant" # Max tokens for Generations MAX_TOKENS = 4000 # Directory to your models (llama.cpp specfic settings) DATA_DIR = /models # Enable Expirmental Message Caches (Limited to single session) # Cache will use ~1.4 GB or MORE of RAM. ONLY ENABLE IF YOUR SYSTEM CAN HANDLE THIS. CACHE = 1 CACHE_TYPE = "ram" # Set number of threads to use, currently, a standard thread will utilize 1 whole core # I usually will set this between all cores I physcally have OR 2 cores less to allow for other processes. N_THREADS = 8 # Always use MMAP unless you know what you are doing USE_MMAP=1 # Only use MLOCK if you know what it does! USE_MLOCK=0 # The higher the number the more hard core. REPEAT_PENALTY=1 # GPU SPECIFIC SETTINGS BELOW GPU=1 N_GPU_LAYERS=35 PYTHONUNBUFFERED=1