diff --git a/.github/workflows/sync_hf_space.yaml b/.github/workflows/sync_hf_space.yaml
index e1c09a5..68befff 100644
--- a/.github/workflows/sync_hf_space.yaml
+++ b/.github/workflows/sync_hf_space.yaml
@@ -1,6 +1,9 @@
 name: Sync to Hugging Face Space
 
 on:
+  release:
+    types: [published]
+
   workflow_dispatch:
 
 jobs:
@@ -10,3 +13,29 @@ jobs:
       - uses: actions/checkout@v4
         with:
           fetch-depth: 0
+
+      - run: git clone https://${{ secrets.HF_USERNAME }}:${{ secrets.HF_TOKEN }}@huggingface.co/spaces/mozilla-ai/structured-qa hf-space
+
+      - run: |
+          cp demo/app.py hf-space/app.py
+          cp demo/Dockerfile hf-space/Dockerfile
+
+      - run: |
+          cd hf-space
+          git config user.name 'github-actions[bot]'
+          git config user.email 'github-actions[bot]@users.noreply.github.com'
+          git add .
+          git commit -m "Sync with https://github.com/mozilla-ai/structured-qa"
+
+      - name: Push to Hugging Face
+        run: |
+          cd hf-space
+          git push https://${{ secrets.HF_USERNAME }}:${{ secrets.HF_TOKEN }}@huggingface.co/spaces/mozilla-ai/structured-qa main
+
+      - name: Reboot Space
+        if: always()
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        run: |
+          pip install huggingface_hub
+          python demo/reboot_space.py
diff --git a/demo/Dockerfile b/demo/Dockerfile
new file mode 100644
index 0000000..8316a78
--- /dev/null
+++ b/demo/Dockerfile
@@ -0,0 +1,26 @@
+FROM nvidia/cuda:12.2.2-cudnn8-devel-ubuntu22.04
+
+RUN apt-get update && apt-get install --no-install-recommends -y \
+    build-essential \
+    python3.10 \
+    python3.10-dev \
+    python3-pip \
+    git \
+    && apt-get clean && rm -rf /var/lib/apt/lists/*
+
+RUN useradd -m -u 1000 user
+
+USER user
+
+ENV HOME=/home/user \
+    PATH=/home/user/.local/bin:$PATH
+
+WORKDIR $HOME/app
+
+RUN pip3 install https://github.com/abetlen/llama-cpp-python/releases/download/v0.3.4-cu122/llama_cpp_python-0.3.4-cp310-cp310-linux_x86_64.whl
+RUN pip3 install structured-qa
+
+COPY --chown=user . $HOME/app
+
+EXPOSE 8501
+ENTRYPOINT ["streamlit", "run", "app.py", "--server.enableXsrfProtection", "false"]
diff --git a/demo/README.md b/demo/README.md
new file mode 100644
index 0000000..31fd149
--- /dev/null
+++ b/demo/README.md
@@ -0,0 +1,11 @@
+---
+title: Structured Qa
+emoji: 📚
+colorFrom: green
+colorTo: purple
+sdk: docker
+app_port: 8501
+pinned: false
+license: apache-2.0
+short_description: Question answering for structured documents
+---
diff --git a/demo/reboot_space.py b/demo/reboot_space.py
new file mode 100644
index 0000000..597095a
--- /dev/null
+++ b/demo/reboot_space.py
@@ -0,0 +1,11 @@
+import os
+
+from huggingface_hub import HfApi
+
+if __name__ == "__main__":
+    api = HfApi()
+    api.restart_space(
+        repo_id="mozilla-ai/structured-qa",
+        token=os.getenv("HF_TOKEN"),
+        factory_reboot=True,
+    )
diff --git a/demo/run.sh b/demo/run.sh
deleted file mode 100755
index bad3e42..0000000
--- a/demo/run.sh
+++ /dev/null
@@ -1,26 +0,0 @@
-#!/bin/bash
-
-# Adapted from https://docs.streamlit.io/deploy/tutorials/kubernetes
-
-APP_PID=
-stopRunningProcess() {
-    # Based on https://linuxconfig.org/how-to-propagate-a-signal-to-child-processes-from-a-bash-script
-    if test ! "${APP_PID}" = '' && ps -p ${APP_PID} > /dev/null ; then
-       > /proc/1/fd/1 echo "Stopping ${COMMAND_PATH} which is running with process ID ${APP_PID}"
-
-       kill -TERM ${APP_PID}
-       > /proc/1/fd/1 echo "Waiting for ${COMMAND_PATH} to process SIGTERM signal"
-
-       wait ${APP_PID}
-       > /proc/1/fd/1 echo "All processes have stopped running"
-    else
-       > /proc/1/fd/1 echo "${COMMAND_PATH} was not started when the signal was sent or it has already been stopped"
-    fi
-}
-
-trap stopRunningProcess EXIT TERM
-
-streamlit run ${HOME}/document-to-podcast/demo/app.py &
-APP_ID=${!}
-
-wait ${APP_ID}
diff --git a/src/structured_qa/model_loaders.py b/src/structured_qa/model_loaders.py
index 62fbc05..8a6d587 100644
--- a/src/structured_qa/model_loaders.py
+++ b/src/structured_qa/model_loaders.py
@@ -1,7 +1,15 @@
-import torch
+import subprocess
 
 from llama_cpp import Llama
 
+def gpu_available():
+    try:
+        subprocess.check_output("nvidia-smi")
+        return True
+    except Exception:
+        return False
+
+
 def load_llama_cpp_model(model_id: str) -> Llama:
     """
     Loads the given model_id using Llama.from_pretrained.
@@ -22,6 +30,6 @@ def load_llama_cpp_model(model_id: str) -> Llama:
         filename=filename,
         n_ctx=0,  # 0 means that the model limit will be used, instead of the default (512) or other hardcoded value
         verbose=False,
-        n_gpu_layers=-1 if torch.cuda.is_available() else 0,
+        n_gpu_layers=-1 if gpu_available() else 0,
     )
     return model