Skip to content

Commit

Permalink
add --keep-groups and --ngl options
Browse files Browse the repository at this point in the history
Signed-off-by: Alexey Korepanov <[email protected]>
  • Loading branch information
khumarahn committed Feb 1, 2025
1 parent 4851d60 commit c9c777d
Show file tree
Hide file tree
Showing 5 changed files with 49 additions and 7 deletions.
7 changes: 7 additions & 0 deletions docs/ramalama.1.md
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,13 @@ The default can be overridden in the ramalama.conf file or via the
RAMALAMA_IMAGE environment variable. `export RAMALAMA_IMAGE=quay.io/ramalama/aiimage:latest` tells
RamaLama to use the `quay.io/ramalama/aiimage:latest` image.

#### **--keep-groups**
pass `--group-add keep-groups` to podman (default: False)
Needed to access the GPU on some systems, but has an impact on security; use with caution.

#### **--ngl**
number of GPU layers (default: 999)

#### **--nocontainer**
do not run RamaLama in the default container (default: False)
The default can be overridden in the ramalama.conf file.
Expand Down
9 changes: 9 additions & 0 deletions docs/ramalama.conf
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,15 @@
#
#host = "0.0.0.0"

# Pass `--group-add keep-groups` to podman.
# In some cases this is needed to access the GPU from a rootless container.
#
#keep_groups = false

# Default number of layers to offload to the GPU.
#
#ngl = 999

# Specify default port for services to listen on
#
#port = "8080"
Expand Down
9 changes: 9 additions & 0 deletions docs/ramalama.conf.5.md
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,15 @@ IP address for llama.cpp to listen on.
OCI container image to run with the specified AI model
RAMALAMA_IMAGE environment variable overrides this field.

**keep_groups**=false

Pass `--group-add keep-groups` to podman.
In some cases this is needed to access the GPU from a rootless container.

**ngl**=999

Default number of layers to offload to the GPU.

**port**="8080"

Specify default port for services to listen on
Expand Down
15 changes: 15 additions & 0 deletions ramalama/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,21 @@ def configure_arguments(parser):
action="store_true",
help="offload the workload to the GPU",
)
parser.add_argument(
"--ngl",
dest="ngl",
type=int,
default=config.get("ngl", 999),
help="Number of layers to offload to the gpu, if available"
)
parser.add_argument(
"--keep-groups",
dest="podman_keep_groups",
default=config.get("keep_groups", False),
action="store_true",
help="""pass `--group-add keep-groups` to podman, if using podman.
Needed to access gpu on some systems, but has security implications.""",
)
parser.add_argument(
"--image",
default=config.get("image"),
Expand Down
16 changes: 9 additions & 7 deletions ramalama/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,8 @@ def setup_container(self, args):

if os.path.basename(args.engine) == "podman":
conman_args += ["--pull=newer"]
if args.podman_keep_groups:
conman_args += ["--group-add", "keep-groups"]
elif os.path.basename(args.engine) == "docker":
try:
run_cmd([args.engine, "pull", "-q", args.image], ignore_all=True)
Expand Down Expand Up @@ -188,10 +190,10 @@ def setup_container(self, args):
conman_args += ["-e", f"{k}={v}"]
return conman_args

def gpu_args(self, force=False, runner=False):
def gpu_args(self, args, runner=False):
gpu_args = []
if (
force
args.gpu
or os.getenv("HIP_VISIBLE_DEVICES")
or os.getenv("ASAHI_VISIBLE_DEVICES")
or os.getenv("CUDA_VISIBLE_DEVICES")
Expand All @@ -206,7 +208,7 @@ def gpu_args(self, force=False, runner=False):
else:
gpu_args += ["-ngl"] # single dash

gpu_args += ["999"]
gpu_args += [ f'{args.ngl}' ]

return gpu_args

Expand Down Expand Up @@ -256,7 +258,7 @@ def build_exec_args_perplexity(self, args, model_path):
exec_args = ["llama-perplexity"]

get_gpu()
gpu_args = self.gpu_args(force=args.gpu)
gpu_args = self.gpu_args(args=args)
if gpu_args is not None:
exec_args.extend(gpu_args)

Expand Down Expand Up @@ -295,7 +297,7 @@ def build_exec_args_bench(self, args, model_path):
exec_args = ["llama-bench"]

get_gpu()
gpu_args = self.gpu_args(force=args.gpu)
gpu_args = self.gpu_args(args=args)
if gpu_args is not None:
exec_args.extend(gpu_args)

Expand All @@ -314,7 +316,7 @@ def build_exec_args_run(self, args, model_path, prompt):
exec_args += ["-v"]

get_gpu()
gpu_args = self.gpu_args(force=args.gpu, runner=True)
gpu_args = self.gpu_args(args=args, runner=True)
if gpu_args is not None:
exec_args.extend(gpu_args)

Expand Down Expand Up @@ -379,7 +381,7 @@ def handle_runtime(self, args, exec_args, exec_model_path):
exec_args = ["--port", args.port, "--model", MNT_FILE, "--max_model_len", "2048"]
else:
get_gpu()
gpu_args = self.gpu_args(force=args.gpu)
gpu_args = self.gpu_args(args=args)
if gpu_args is not None:
exec_args.extend(gpu_args)
exec_args.extend(["--host", args.host])
Expand Down

0 comments on commit c9c777d

Please sign in to comment.