From b0b2a136b5809881622f7f3d5fa8768f064872c8 Mon Sep 17 00:00:00 2001
From: Adam Bouhenguel
Date: Thu, 30 Nov 2023 11:24:01 -0500
Subject: [PATCH] Support llamafile images (with CUDA support)

---
 .github/workflows/push.yml | 10 ++++++
 Dockerfile                 | 71 ++++++++++++++++++++++++++++++++++----
 README.md                  |  2 ++
 docker-compose.yml         | 36 +++++++++++++++++++
 4 files changed, 113 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml
index 3211e6a..2814ca2 100644
--- a/.github/workflows/push.yml
+++ b/.github/workflows/push.yml
@@ -29,6 +29,16 @@ jobs:
           registry: ${{ env.REGISTRY }}
           username: ${{ github.actor }}
           password: ${{ secrets.GITHUB_TOKEN }}
+      - name: Maximize available disk space
+        # From https://github.com/actions/runner-images/issues/2840#issuecomment-790492173
+        shell: bash
+        run: |
+          df -h
+          sudo rm -rf /usr/share/dotnet
+          sudo rm -rf /opt/ghc
+          sudo rm -rf "/usr/local/share/boost"
+          sudo rm -rf "$AGENT_TOOLSDIRECTORY"
+          df -h
       - name: Build and push containers
         shell: bash
         run: |
diff --git a/Dockerfile b/Dockerfile
index 6b2ccf7..8b35ed6 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -21,7 +21,8 @@ ADD --checksum=sha256:241dc90f3e92b22c9e08cfb5f6df2e920da258e3c461d9677f267ab7a
 WORKDIR /opt/cosmos
 RUN unzip /dl/cosmos.zip
 WORKDIR /opt/cosmos/bin
-RUN /usr/bin/assimilate-x86_64.elf -c dd \
+RUN /usr/bin/assimilate-x86_64.elf -c assimilate \
+    && /usr/bin/assimilate-x86_64.elf -c dd \
     && /usr/bin/assimilate-x86_64.elf -c cp \
     && /usr/bin/assimilate-x86_64.elf -c mv \
     && /usr/bin/assimilate-x86_64.elf -c echo \
@@ -98,8 +99,66 @@ CMD ["/bin/bash"]
 # COPY --from=unpack-cosmos /usr/bin/ /usr/bin/
 # CMD /bin/bash
 
-# FROM cosmos-scratch as mistral-7b-instruct-v0.1-Q4_K_M-main
-# LABEL org.opencontainers.image.source https://github.com/ajbouh/cosmos
-# COPY --chmod=0755 mistral-7b-instruct-v0.1-Q4_K_M-main.llamafile /usr/bin/mistral-7b-instruct-v0.1-Q4_K_M-main.llamafile
-# ENV PATH=/bin:/usr/bin
-# ENTRYPOINT ["/bin/sh", "-c", "exec \"$@\"", "sh", "/usr/bin/mistral-7b-instruct-v0.1-Q4_K_M-main.llamafile"]
+FROM cosmos-scratch as llamafile
+LABEL org.opencontainers.image.source https://github.com/ajbouh/cosmos
+ARG LLAMAFILE_URL
+ARG LLAMAFILE_CHECKSUM
+ADD --checksum=${LLAMAFILE_CHECKSUM} --chmod=0755 ${LLAMAFILE_URL} /usr/bin/llamafile
+ENTRYPOINT ["/bin/sh", "-c", "exec \"$@\"", "sh", "/usr/bin/llamafile"]
+
+FROM cosmos-scratch as llamafile-gguf
+LABEL org.opencontainers.image.source https://github.com/ajbouh/cosmos
+ADD --checksum=sha256:dc538ce8721bb84ad3a9f683757ce7a227e61bf2c6e092c4014838fe198c41cc --chmod=0755 https://github.com/Mozilla-Ocho/llamafile/releases/download/0.1/llamafile-main-0.1 /usr/bin/llamafile-main
+ARG GGUF_URL
+ARG GGUF_CHECKSUM
+ADD --checksum=${GGUF_CHECKSUM} --chmod=0755 ${GGUF_URL} /model.gguf
+ENTRYPOINT ["/bin/sh", "-c", "exec \"$@\"", "sh", "/usr/bin/llamafile-main", "-m", "/model.gguf"]
+
+FROM nvidia/cuda:12.1.1-devel-ubuntu22.04 as devel-llamafile
+ADD --checksum=sha256:dc538ce8721bb84ad3a9f683757ce7a227e61bf2c6e092c4014838fe198c41cc --chmod=0755 https://github.com/Mozilla-Ocho/llamafile/releases/download/0.1/llamafile-main-0.1 /usr/bin/llamafile-main
+# HACK we need to assimilate so this can run on GitHub Actions...
+COPY --from=unpack-cosmos /usr/bin/assimilate /usr/bin/
+RUN /usr/bin/assimilate -c /usr/bin/llamafile-main
+# HACK get llamafile to build stubs we can use at runtime. It would be better to use an "only compile stubs" entrypoint.
+RUN (/usr/bin/llamafile-main -m /dev/null --n-gpu-layers 1 || true) \
+    && [ -e /root/.cosmo ] && [ -e /root/.llamafile ]
+
+FROM cosmos-scratch as llamafile-cuda-scratch
+LABEL org.opencontainers.image.source https://github.com/ajbouh/cosmos
+COPY --from=devel-llamafile /usr/local/cuda/targets/x86_64-linux/lib/libcublas.so.12 /usr/local/cuda/targets/x86_64-linux/lib/libcublasLt.so.12 /usr/local/cuda/targets/x86_64-linux/lib/
+COPY --from=devel-llamafile /lib64/ld-linux-x86-64.so.2 /lib64/ld-linux-x86-64.so.2
+COPY --from=devel-llamafile /lib/x86_64-linux-gnu/libstdc++.so.6 /lib/x86_64-linux-gnu/libm.so.6 /lib/x86_64-linux-gnu/libgcc_s.so.1 /lib/x86_64-linux-gnu/libc.so.6 /lib/x86_64-linux-gnu/librt.so.1 /lib/x86_64-linux-gnu/libpthread.so.0 /lib/x86_64-linux-gnu/libdl.so.2 /lib/x86_64-linux-gnu/
+WORKDIR /root
+COPY --from=devel-llamafile /root/.cosmo /root/.cosmo
+COPY --from=devel-llamafile /root/.llamafile /root/.llamafile
+ENV PATH=/bin:/usr/bin
+ENV HOME=/root
+ENV LD_LIBRARY_PATH=/usr/local/cuda/targets/x86_64-linux/lib:/lib:/lib64
+# HACK forge an executable nvcc, because llamafile needs to find nvcc before looking for cached .cosmo and .llamafile files
+COPY --from=unpack-cosmos /bin/chmod /bin/
+WORKDIR /usr/local/cuda/bin/
+RUN printf "" >nvcc
+RUN chmod 0755 nvcc
+# HACK things seem to fail if we have multiple CUDA devices. limit ourselves to one device for now to avoid errors like:
+# > CUDA error 2 at /root/.llamafile/ggml-cuda.cu:7864: out of memory
+# > current device: 4
+ENV CUDA_VISIBLE_DEVICES=0
+
+FROM llamafile-cuda-scratch as llamafile-cuda
+LABEL org.opencontainers.image.source https://github.com/ajbouh/cosmos
+ARG LLAMAFILE_URL
+ARG LLAMAFILE_CHECKSUM
+ADD --checksum=${LLAMAFILE_CHECKSUM} --chmod=0755 ${LLAMAFILE_URL} /usr/bin/llamafile
+ARG LLAMAFILE_N_GPU_LAYERS=35
+ENV LLAMAFILE_N_GPU_LAYERS=${LLAMAFILE_N_GPU_LAYERS}
+ENTRYPOINT ["/bin/sh", "-c", "exec \"$@\" --n-gpu-layers $LLAMAFILE_N_GPU_LAYERS", "sh", "/usr/bin/llamafile"]
+
+FROM llamafile-cuda-scratch as llamafile-gguf-cuda
+LABEL org.opencontainers.image.source https://github.com/ajbouh/cosmos
+ADD --checksum=sha256:dc538ce8721bb84ad3a9f683757ce7a227e61bf2c6e092c4014838fe198c41cc --chmod=0755 https://github.com/Mozilla-Ocho/llamafile/releases/download/0.1/llamafile-main-0.1 /usr/bin/llamafile-main
+ARG GGUF_URL
+ARG GGUF_CHECKSUM
+ADD --checksum=${GGUF_CHECKSUM} --chmod=0755 ${GGUF_URL} /model.gguf
+ARG LLAMAFILE_N_GPU_LAYERS=35
+ENV LLAMAFILE_N_GPU_LAYERS=${LLAMAFILE_N_GPU_LAYERS}
+ENTRYPOINT ["/bin/sh", "-c", "exec \"$@\" --n-gpu-layers $LLAMAFILE_N_GPU_LAYERS", "sh", "/usr/bin/llamafile-main", "-m", "/model.gguf"]
diff --git a/README.md b/README.md
index 13e531e..ba13334 100644
--- a/README.md
+++ b/README.md
@@ -8,4 +8,6 @@ docker compose run --build --rm -it python
 docker compose run --build --rm -it lua
 docker compose run --build --rm -it sqlite3
 docker compose run --build --rm -it qjs
+docker compose run --build --rm -it mistral-7b-instruct-v0.1-q4_k_m-cuda
+docker compose run --build --rm -it mistral-7b-instruct-v0.1-q4_k_m
 ```
diff --git a/docker-compose.yml b/docker-compose.yml
index 4e80688..9164113 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -33,3 +33,39 @@ services:
       target: ape
       args:
         COSMOS_EXE: /usr/bin/qjs
+  mistral-7b-instruct-v0.1-q4_k_m-cuda:
+    image: ghcr.io/ajbouh/cosmos:mistral-7b-instruct-v0.1-q4_k_m-cuda-12.1.1-cosmo-3.1.1
+    deploy: {resources: {reservations: {devices: [{driver: nvidia, count: all, capabilities: ["gpu"]}]}}}
+    build:
+      dockerfile: Dockerfile
+      target: llamafile-cuda
+      args:
+        LLAMAFILE_URL: https://huggingface.co/jartine/mistral-7b.llamafile/resolve/main/mistral-7b-instruct-v0.1-Q4_K_M-main.llamafile?download=true
+        LLAMAFILE_CHECKSUM: sha256:c8d34c244e01a91df1e8b22196dfddb9662f6b08fbcd4a23609d7b736b56f4ae
+        LLAMAFILE_N_GPU_LAYERS: 35
+  mistral-7b-instruct-v0.1-q4_k_m:
+    image: ghcr.io/ajbouh/cosmos:mistral-7b-instruct-v0.1-q4_k_m-cosmo-3.1.1
+    build:
+      dockerfile: Dockerfile
+      target: llamafile
+      args:
+        LLAMAFILE_URL: https://huggingface.co/jartine/mistral-7b.llamafile/resolve/main/mistral-7b-instruct-v0.1-Q4_K_M-main.llamafile?download=true
+        LLAMAFILE_CHECKSUM: sha256:c8d34c244e01a91df1e8b22196dfddb9662f6b08fbcd4a23609d7b736b56f4ae
+  llava-v1.5-7b-q4_k-cuda:
+    image: ghcr.io/ajbouh/cosmos:llava-v1.5-7b-q4_k-cuda-12.1.1-cosmo-3.1.1
+    deploy: {resources: {reservations: {devices: [{driver: nvidia, count: all, capabilities: ["gpu"]}]}}}
+    build:
+      dockerfile: Dockerfile
+      target: llamafile-gguf-cuda
+      args:
+        GGUF_URL: https://huggingface.co/jartine/llava-v1.5-7B-GGUF/resolve/main/llava-v1.5-7b-Q4_K.gguf?download=true
+        GGUF_CHECKSUM: sha256:c91ebf0a628ceb25e374df23ad966cc1bf1514b33fecf4f0073f9619dec5b3f9
+        LLAMAFILE_N_GPU_LAYERS: 35
+  llava-v1.5-7b-q4_k:
+    image: ghcr.io/ajbouh/cosmos:llava-v1.5-7b-q4_k-cosmo-3.1.1
+    build:
+      dockerfile: Dockerfile
+      target: llamafile-gguf
+      args:
+        GGUF_URL: https://huggingface.co/jartine/llava-v1.5-7B-GGUF/resolve/main/llava-v1.5-7b-Q4_K.gguf?download=true
+        GGUF_CHECKSUM: sha256:c91ebf0a628ceb25e374df23ad966cc1bf1514b33fecf4f0073f9619dec5b3f9
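
Usage sketch (editor's note, not part of the patch): a minimal way to exercise the new
compose services, assuming a host with the NVIDIA container toolkit for the CUDA variant.
The -p/-n flags are ordinary llama.cpp "main" options; anything after the service name is
forwarded to the llamafile binary by the ENTRYPOINT, and the CUDA targets additionally
append --n-gpu-layers $LLAMAFILE_N_GPU_LAYERS, which can be overridden per run because it
is declared with ENV. The prompt text and token count below are illustrative only.

    # CPU-only image: arguments after the service name reach llamafile unchanged
    docker compose run --build --rm -it mistral-7b-instruct-v0.1-q4_k_m \
        -p '[INST]Write a haiku about containers[/INST]' -n 128

    # CUDA image: offload fewer layers if the GPU runs out of memory
    docker compose run --build --rm -it -e LLAMAFILE_N_GPU_LAYERS=20 \
        mistral-7b-instruct-v0.1-q4_k_m-cuda \
        -p '[INST]Write a haiku about containers[/INST]' -n 128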