Commit def1c18

Updates for issues in #1789
1 parent fc5a703 commit def1c18

12 files changed, +27 -25 lines

docker_build_script_ubuntu.sh

Lines changed: 2 additions & 2 deletions
@@ -41,8 +41,8 @@ wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
 
 # if building for CPU, would remove CMAKE_ARGS and avoid GPU image as base image
 # Choose llama_cpp_python ARGS for your system according to [llama_cpp_python backend documentation](https://github.com/abetlen/llama-cpp-python?tab=readme-ov-file#supported-backends), e.g. for CUDA:
-export LLAMA_CUBLAS=1
-export CMAKE_ARGS="-DLLAMA_CUBLAS=on -DCMAKE_CUDA_ARCHITECTURES=all"
+export GGML_CUDA=1
+export CMAKE_ARGS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=all"
 # for Metal MAC M1/M2 comment out above two lines and uncomment out the below line
 # export CMAKE_ARGS="-DLLAMA_METAL=on"
 export FORCE_CMAKE=1
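These exports only take effect when llama_cpp_python is rebuilt from source. A minimal sketch of such a rebuild with the renamed flags (package names and the 0.2.87 pin come from other files in this commit; the exact invocation is an assumption, not part of this hunk):

```bash
# Sketch: rebuild llama_cpp_python from source with CUDA enabled,
# using the renamed GGML_CUDA flags (adjust versions/arches for your system).
export GGML_CUDA=1
export CMAKE_ARGS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=all"
export FORCE_CMAKE=1
pip uninstall -y llama_cpp_python llama_cpp_python_cuda
pip install llama_cpp_python==0.2.87 --no-cache-dir
```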

docs/FAQ.md

Lines changed: 3 additions & 3 deletions
@@ -2538,7 +2538,7 @@ on CPU, or for GPU:
 git clone https://github.com/ggerganov/llama.cpp
 cd llama.cpp
 make clean
-make LLAMA_CUBLAS=1
+make GGML_CUDA=1
 ```
 etc. following different [scenarios](https://github.com/ggerganov/llama.cpp#build).
 
@@ -2928,8 +2928,8 @@ Other workarounds:
 * Workaround 2: Follow normal directions for installation, but replace 0.2.76 with 0.2.26, e.g. for CUDA with Linux:
 ```bash
 pip uninstall llama_cpp_python llama_cpp_python_cuda -y
-export LLAMA_CUBLAS=1
-export CMAKE_ARGS="-DLLAMA_CUBLAS=on -DCMAKE_CUDA_ARCHITECTURES=all"
+export GGML_CUDA=1
+export CMAKE_ARGS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=all"
 export FORCE_CMAKE=1
 pip install llama_cpp_python==0.2.26 --no-cache-dir
 ```
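The `make GGML_CUDA=1` change above tracks llama.cpp's rename of its CUDA build flag from `LLAMA_CUBLAS` to `GGML_CUDA`. On recent llama.cpp checkouts that favor CMake over the Makefile, a roughly equivalent build would look like the sketch below (an assumption about the upstream build system, not part of this commit):

```bash
# Sketch: CMake-based llama.cpp build with CUDA, assuming a recent checkout
# where the CUDA option is named GGML_CUDA.
git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
cmake -B build -DGGML_CUDA=on
cmake --build build --config Release -j
```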

docs/README_DOCKER.md

Lines changed: 1 addition & 1 deletion
@@ -74,7 +74,7 @@ For example, for Metal M1/M2 support of llama.cpp GGUF files, one should change
 ```bash
 export CMAKE_ARGS="-DLLAMA_METAL=on"
 ```
-and remove `LLAMA_CUBLAS=1`, so that the docker image is Metal Compatible for llama.cpp GGUF files. Otherwise, Torch supports Metal M1/M2 directly without changes.
+and remove `GGML_CUDA=1`, so that the docker image is Metal Compatible for llama.cpp GGUF files. Otherwise, Torch supports Metal M1/M2 directly without changes.
 
 ### Build

docs/README_LINUX.md

Lines changed: 2 additions & 2 deletions
@@ -110,8 +110,8 @@ sudo sh cuda_12.1.1_530.30.02_linux.run
 
 * Choose llama_cpp_python ARGS for your system according to [llama_cpp_python backend documentation](https://github.com/abetlen/llama-cpp-python?tab=readme-ov-file#supported-backends), e.g. for CUDA:
 ```bash
-export LLAMA_CUBLAS=1
-export CMAKE_ARGS="-DLLAMA_CUBLAS=on -DCMAKE_CUDA_ARCHITECTURES=all"
+export GGML_CUDA=1
+export CMAKE_ARGS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=all"
 export FORCE_CMAKE=1
 ```
 Note for some reason things will fail with llama_cpp_python if don't add all cuda arches, and building with all those arches does take some time.
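To confirm the resulting wheel was actually built with CUDA offload, one option is the check below (a sketch; it assumes the installed llama_cpp_python re-exports llama.cpp's `llama_supports_gpu_offload` helper, which may not hold for every version):

```bash
# Sketch: check that llama_cpp_python was built with GPU offload support.
# Assumes the binding re-exports llama.cpp's llama_supports_gpu_offload().
python -c "import llama_cpp; print(llama_cpp.llama_supports_gpu_offload())"
```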

docs/README_WHEEL.md

Lines changed: 4 additions & 4 deletions
@@ -13,8 +13,8 @@ Install in fresh env, avoiding being inside h2ogpt directory or a directory wher
 ```bash
 export CUDA_HOME=/usr/local/cuda-12.1
 export PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cu121 https://huggingface.github.io/autogptq-index/whl/cu121"
-set CMAKE_ARGS=-DLLAMA_CUBLAS=on -DCMAKE_CUDA_ARCHITECTURES=all
-set LLAMA_CUBLAS=1
+set CMAKE_ARGS=-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=all
+set GGML_CUDA=1
 set FORCE_CMAKE=1
 ```
 for the cmake args, choose e llama_cpp_python ARGS for your system according to [llama_cpp_python backend documentation](https://github.com/abetlen/llama-cpp-python?tab=readme-ov-file#supported-backends). Note for some reason things will fail with llama_cpp_python if don't add all cuda arches, and building with all those arches does take some time.
@@ -37,7 +37,7 @@ conda install weasyprint pygobject -c conda-forge -y
 ```
 second run:
 ```bash
-export CMAKE_ARGS="-DLLAMA_CUBLAS=on -DCMAKE_CUDA_ARCHITECTURES=all"
+export CMAKE_ARGS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=all"
 export CUDA_HOME=/usr/local/cuda-12.1
 export PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cu121 https://huggingface.github.io/autogptq-index/whl/cu121"
 pip install h2ogpt==0.2.0[cuda] --index-url https://downloads.h2ogpt.h2o.ai --extra-index-url https://pypi.org/simple --no-cache
@@ -65,7 +65,7 @@ which can be installed with basic CUDA support like:
 ```bash
 # For other GPUs etc. see: https://github.com/abetlen/llama-cpp-python?tab=readme-ov-file#supported-backends
 # required for PyPi wheels that do not allow URLs, so uses generic llama_cpp_python package:
-export CMAKE_ARGS="-DLLAMA_CUBLAS=on -DCMAKE_CUDA_ARCHITECTURES=all"
+export CMAKE_ARGS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=all"
 export CUDA_HOME=/usr/local/cuda-12.1
 export PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cu121 https://huggingface.github.io/autogptq-index/whl/cu121"
 # below [cuda] assumes CUDA 12.1 for some packages like AutoAWQ etc.

docs/README_WINDOWS.md

Lines changed: 2 additions & 2 deletions
@@ -60,8 +60,8 @@
 ```
 * For non-CPU case, choose llama_cpp_python ARGS for your system according to [llama_cpp_python backend documentation](https://github.com/abetlen/llama-cpp-python?tab=readme-ov-file#supported-backends), e.g. for CUDA:
 ```cmdline
-set CMAKE_ARGS=-DLLAMA_CUBLAS=on -DCMAKE_CUDA_ARCHITECTURES=all
-set LLAMA_CUBLAS=1
+set CMAKE_ARGS=-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=all
+set GGML_CUDA=1
 set FORCE_CMAKE=1
 ```
 Note for some reason things will fail with llama_cpp_python if don't add all cuda arches, and building with all those arches does take some time.

docs/README_quickstart.md

Lines changed: 4 additions & 4 deletions
@@ -17,15 +17,15 @@ To quickly try out h2oGPT with limited document Q/A capability, create a fresh P
 Then choose your llama_cpp_python options, by changing `CMAKE_ARGS` to whichever system you have according to [llama_cpp_python backend documentation](https://github.com/abetlen/llama-cpp-python?tab=readme-ov-file#supported-backends).
 E.g. CUDA on Linux:
 ```bash
-export LLAMA_CUBLAS=1
-export CMAKE_ARGS="-DLLAMA_CUBLAS=on -DCMAKE_CUDA_ARCHITECTURES=all"
+export GGML_CUDA=1
+export CMAKE_ARGS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=all"
 export FORCE_CMAKE=1
 ```
 Note for some reason things will fail with llama_cpp_python if don't add all cuda arches, and building with all those arches does take some time.
 Windows CUDA:
 ```cmdline
-set CMAKE_ARGS=-DLLAMA_CUBLAS=on -DCMAKE_CUDA_ARCHITECTURES=all
-set LLAMA_CUBLAS=1
+set CMAKE_ARGS=-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=all
+set GGML_CUDA=1
 set FORCE_CMAKE=1
 ```
 Note for some reason things will fail with llama_cpp_python if don't add all cuda arches, and building with all those arches does take some time.

docs/linux_install_full.sh

Lines changed: 2 additions & 2 deletions
@@ -53,8 +53,8 @@ conda install python=3.10 -c conda-forge -y
 
 export CUDA_HOME=/usr/local/cuda-12.1
 export PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cu121"
-export LLAMA_CUBLAS=1
-export CMAKE_ARGS="-DLLAMA_CUBLAS=on -DCMAKE_CUDA_ARCHITECTURES=all"
+export GGML_CUDA=1
+export CMAKE_ARGS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=all"
 export FORCE_CMAKE=1
 
 # get patches

reqs_optional/reqs_constraints.txt

Lines changed: 2 additions & 1 deletion
@@ -1,5 +1,6 @@
 # ensure doesn't drift, e.g. Issue #1348
-torch==2.3.1
+torch==2.2.1 ;sys_platform != "darwin" and platform_machine != "arm64"
+torch==2.3.1; sys_platform == "darwin" and platform_machine == "arm64"
 gradio==4.26.0
 gradio_client==0.15.1
 transformers>=4.43.2
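If this file is applied as a pip constraints file, as its name suggests (an assumption about how the install scripts consume it, not shown in this diff), the platform-split torch pins would take effect like this:

```bash
# Sketch: apply the constraints file so whichever torch pin matches this
# platform's environment markers wins (paths assumed from this repo layout).
pip install -r requirements.txt -c reqs_optional/reqs_constraints.txt
```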
Lines changed: 1 addition & 1 deletion
@@ -1,5 +1,5 @@
 gpt4all==1.0.5
 
 # requires env to be set for specific systems
-llama-cpp-python==0.2.85
+llama-cpp-python==0.2.87

requirements.txt

Lines changed: 3 additions & 2 deletions
@@ -17,7 +17,7 @@ huggingface_hub>=0.23.3
 appdirs>=1.4.4
 fire>=0.5.0
 docutils>=0.20.1
-torch==2.3.1; sys_platform != "darwin" and platform_machine != "arm64"
+torch==2.2.1; sys_platform != "darwin" and platform_machine != "arm64"
 torch==2.3.1; sys_platform == "darwin" and platform_machine == "arm64"
 evaluate>=0.4.0
 rouge_score>=0.1.2
@@ -32,8 +32,9 @@ matplotlib>=3.7.1
 
 # transformers
 loralib>=0.1.2
+bitsandbytes>=0.43.1; sys_platform != "darwin" and platform_machine != "arm64"
 #bitsandbytes downgraded because of Mac M1/M2 support issue. See https://github.com/axolotl-ai-cloud/axolotl/issues/1436
-bitsandbytes==0.42.0
+bitsandbytes==0.42.0; sys_platform == "darwin" and platform_machine == "arm64"
 accelerate>=0.30.1
 peft>=0.7.0
 transformers>=4.43.2
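The torch and bitsandbytes pins now select per platform via PEP 508 environment markers. A quick way to see which branch applies on a given machine (a sketch using the `packaging` library, assumed to be available in the environment):

```bash
# Sketch: show the values markers are evaluated against, then evaluate the
# macOS/arm64 marker used by the torch and bitsandbytes pins above.
python -c "import sys, platform; print(sys.platform, platform.machine())"
python - <<'EOF'
from packaging.markers import Marker
m = Marker('sys_platform == "darwin" and platform_machine == "arm64"')
print("macOS/arm64 branch applies:", m.evaluate())
EOF
```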

src/version.py

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-__version__ = "7435b4bc4c0e6559fd90e89f7a3f51f9353ccf89"
+__version__ = "fc5a7031e4086a2878797fd062547da051b50e0d"
