Nightly #632

Merged
133 commits, merged Jun 13, 2024
Commits
7df08c4
Update llama.py
danielhanchen May 19, 2024
ba5b6ce
offload
danielhanchen May 19, 2024
a07057e
Update llama.py
danielhanchen May 19, 2024
4be9063
Update llama.py
danielhanchen May 19, 2024
3dc3d3f
Update llama.py
danielhanchen May 19, 2024
f1cc1e8
Update llama.py
danielhanchen May 19, 2024
5cb531a
Update llama.py
danielhanchen May 19, 2024
6bd8e60
Update llama.py
danielhanchen May 19, 2024
d1d57ff
Update llama.py
danielhanchen May 19, 2024
7470f67
continued pretraining trainer
danielhanchen May 20, 2024
da9c1a6
Update trainer.py
danielhanchen May 20, 2024
2c68f56
Update trainer.py
danielhanchen May 20, 2024
217bf9d
Update trainer.py
danielhanchen May 20, 2024
6e85384
Update trainer.py
danielhanchen May 21, 2024
77f9c51
is_bfloat16_supported
danielhanchen May 21, 2024
c0e1d27
Update __init__.py
danielhanchen May 21, 2024
2b23b93
Update README.md
danielhanchen May 21, 2024
902e23a
Update llama.py
danielhanchen May 21, 2024
98f41ce
Merge branch 'main' into nightly
danielhanchen May 22, 2024
3193cac
is_bfloat16_supported
danielhanchen May 22, 2024
dfeaf4b
Update __init__.py
danielhanchen May 22, 2024
1e84090
Mistral v3
danielhanchen May 22, 2024
f63f32b
Merge branch 'main' into nightly
danielhanchen May 23, 2024
57ad8e7
Phi 3 medium
danielhanchen May 23, 2024
2b994b2
Update chat_templates.py
danielhanchen May 23, 2024
ff8171f
Update chat_templates.py
danielhanchen May 23, 2024
5ca8b58
Phi-3
danielhanchen May 23, 2024
98c2e81
Merge branch 'main' into nightly
danielhanchen May 23, 2024
3817660
Merge branch 'main' into nightly
danielhanchen May 23, 2024
f858145
Merge branch 'main' into nightly
danielhanchen May 24, 2024
a1328f6
Update save.py
danielhanchen May 24, 2024
fb29673
Update README.md
shimmyshimmer May 25, 2024
fa85556
Untrained tokens
danielhanchen May 26, 2024
c511aca
Update tokenizer_utils.py
danielhanchen May 26, 2024
35e7355
Update tokenizer_utils.py
danielhanchen May 26, 2024
cc0bf44
Update tokenizer_utils.py
danielhanchen May 26, 2024
674ba66
Update tokenizer_utils.py
danielhanchen May 26, 2024
9823f52
Update tokenizer_utils.py
danielhanchen May 26, 2024
c0c761b
Update tokenizer_utils.py
danielhanchen May 26, 2024
e2850c0
Update tokenizer_utils.py
danielhanchen May 26, 2024
8e12780
Update tokenizer_utils.py
danielhanchen May 26, 2024
6f1855e
Update tokenizer_utils.py
danielhanchen May 26, 2024
d27b173
Update tokenizer_utils.py
danielhanchen May 26, 2024
7bf7399
Update tokenizer_utils.py
danielhanchen May 26, 2024
31ecef9
Update tokenizer_utils.py
danielhanchen May 26, 2024
b67d93f
Update tokenizer_utils.py
danielhanchen May 26, 2024
e874ccd
Update tokenizer_utils.py
danielhanchen May 26, 2024
d7b54ff
Update tokenizer_utils.py
danielhanchen May 27, 2024
5a4a512
Update tokenizer_utils.py
danielhanchen May 27, 2024
82c040e
Update tokenizer_utils.py
danielhanchen May 27, 2024
8e227b2
Update tokenizer_utils.py
danielhanchen May 27, 2024
250d386
Update tokenizer_utils.py
danielhanchen May 27, 2024
e6db3ba
Update llama.py
danielhanchen May 27, 2024
e673fa2
Update tokenizer_utils.py
danielhanchen May 27, 2024
222b835
Update tokenizer_utils.py
danielhanchen May 27, 2024
6404aa5
Update tokenizer_utils.py
danielhanchen May 27, 2024
cfea7b2
Update tokenizer_utils.py
danielhanchen May 27, 2024
083e5ba
Update save.py
danielhanchen May 27, 2024
6f2565c
Update save.py
danielhanchen May 27, 2024
c19b04e
Update save.py
danielhanchen May 27, 2024
64b12a2
checkpoint
danielhanchen May 28, 2024
4cd5a8a
Merge branch 'main' into nightly
danielhanchen May 28, 2024
196faec
Update _utils.py
danielhanchen May 28, 2024
235be40
Update tokenizer_utils.py
danielhanchen May 29, 2024
cf9090a
Update tokenizer_utils.py
danielhanchen May 29, 2024
1fb1110
Update tokenizer_utils.py
danielhanchen May 29, 2024
d1bd60c
Update llama.py
danielhanchen May 30, 2024
732ead0
accelerate
danielhanchen May 30, 2024
359ae5c
Update _utils.py
danielhanchen May 30, 2024
8dcfad3
Update _utils.py
danielhanchen May 30, 2024
2bafc57
Update _utils.py
danielhanchen May 30, 2024
90f6311
Update _utils.py
danielhanchen May 30, 2024
7b84ff7
Update _utils.py
danielhanchen May 30, 2024
60f4b9a
Update _utils.py
danielhanchen May 30, 2024
3ebe5a5
Update _utils.py
danielhanchen May 30, 2024
7bbc8ce
Update tokenizer_utils.py
danielhanchen May 30, 2024
6f5c84c
train_dataloader
danielhanchen May 30, 2024
0d269ca
Update llama.py
danielhanchen May 30, 2024
6b7c142
Update llama.py
danielhanchen May 30, 2024
54f3a74
Update llama.py
danielhanchen May 30, 2024
0bc96c5
use_fast_convert
danielhanchen May 30, 2024
02c91b0
Merge branch 'main' into nightly
danielhanchen May 30, 2024
b384ff0
Merge branch 'main' into nightly
danielhanchen May 30, 2024
a8b5d89
Update save.py
danielhanchen May 30, 2024
872d569
Update save.py
danielhanchen May 30, 2024
3a1f5f2
Update save.py
danielhanchen May 30, 2024
bcadc8c
Update save.py
danielhanchen Jun 2, 2024
1381820
remove_special_tokens
danielhanchen Jun 2, 2024
e01b87d
Ollama
danielhanchen Jun 2, 2024
b3479c7
Update chat_templates.py
danielhanchen Jun 3, 2024
86804dc
Update chat_templates.py
danielhanchen Jun 3, 2024
87fdd3a
Update chat_templates.py
danielhanchen Jun 3, 2024
5c5df69
Merge branch 'main' into nightly
danielhanchen Jun 7, 2024
6386d94
Update llama.py
danielhanchen Jun 7, 2024
b1a9551
Update chat_templates.py
danielhanchen Jun 9, 2024
344a05d
Support bfloat16 GGUF
danielhanchen Jun 9, 2024
6b11e0d
Update save.py
danielhanchen Jun 9, 2024
c6e4b5b
Update llama.py
danielhanchen Jun 9, 2024
57f29ab
fast_forward_inference
danielhanchen Jun 9, 2024
d32e972
Update mapper.py
danielhanchen Jun 9, 2024
e121fa5
Update loader.py
danielhanchen Jun 9, 2024
5eaa10f
Update llama.py
danielhanchen Jun 9, 2024
f57d28d
Update tokenizer_utils.py
danielhanchen Jun 10, 2024
8937507
info
danielhanchen Jun 11, 2024
8982edb
edits
danielhanchen Jun 11, 2024
8904605
Create chat template
danielhanchen Jun 11, 2024
2a374c2
Fix tokenizer
danielhanchen Jun 12, 2024
d704b73
Merge branch 'main' into nightly
danielhanchen Jun 13, 2024
8176155
Update tokenizer_utils.py
danielhanchen Jun 13, 2024
21a99f1
fix case where gguf saving fails due to first_conversion dtype (#630)
chrehall68 Jun 13, 2024
dbf2dcf
Support revision parameter in FastLanguageModel.from_pretrained (#629)
chrehall68 Jun 13, 2024
9016171
clears any selected_adapters before calling internal_model.save_pretr…
neph1 Jun 13, 2024
0428920
Update __init__.py (#602)
xyangk Jun 13, 2024
9fdd847
Fixed unsloth/tokenizer_utils.py for chat training (#604)
Oseltamivir Jun 13, 2024
b5fc6aa
Add GGML saving option to Unsloth for easier Ollama model creation an…
mahiatlinux Jun 13, 2024
3fafbf7
docs: Add LoraConfig parameters documentation (#619)
sebdg Jun 13, 2024
273a871
llama.cpp failing (#371)
bet0x Jun 13, 2024
b312b3f
fix libcuda_dirs import for triton 3.0 (#227)
t-vi Jun 13, 2024
1601dca
Update save.py
danielhanchen Jun 13, 2024
26dc502
Update __init__.py
danielhanchen Jun 13, 2024
6a51657
Update fast_lora.py
danielhanchen Jun 13, 2024
4a8ba90
Update save.py
danielhanchen Jun 13, 2024
0abb5ba
Update save.py
danielhanchen Jun 13, 2024
b24dd05
Update save.py
danielhanchen Jun 13, 2024
48c6d6d
Update loader.py
danielhanchen Jun 13, 2024
e35f608
Update save.py
danielhanchen Jun 13, 2024
4822eae
Update save.py
danielhanchen Jun 13, 2024
7d847ed
quantize now llama-quantize
danielhanchen Jun 13, 2024
82f10cb
Update chat_templates.py
danielhanchen Jun 13, 2024
08424f0
Update loader.py
danielhanchen Jun 13, 2024
eb906d0
Update mapper.py
danielhanchen Jun 13, 2024
0a304ae
Update __init__.py
danielhanchen Jun 13, 2024
71edc42
embedding size
danielhanchen Jun 13, 2024
87 changes: 87 additions & 0 deletions PARAMETERS.md
@@ -0,0 +1,87 @@
## LoraConfig Parameters

Adjusting the `LoraConfig` parameters allows you to balance model performance and computational efficiency in Low-Rank Adaptation (LoRA). Here’s a concise breakdown of key parameters:

**r**
- **Description**: Rank of the low-rank decomposition for factorizing weight matrices.
- **Impact**:
- **Higher**: Retains more information, increases computational load.
- **Lower**: Fewer parameters, more efficient training, potential performance drop if too small.


**lora_alpha**
- **Description**: Scaling factor for the low-rank matrices' contribution.
- **Impact**:
- **Higher**: Increases influence, speeds up convergence, risks instability or overfitting.
- **Lower**: Subtler effect, may require more training steps.

**lora_dropout**
- **Description**: Probability of zeroing out elements in low-rank matrices for regularization.
- **Impact**:
- **Higher**: More regularization, prevents overfitting, may slow training and degrade performance.
- **Lower**: Less regularization, may speed up training, risks overfitting.

**loftq_config**
- **Description**: Configuration for LoftQ, a quantization method for the backbone weights and initialization of LoRA layers.
- **Impact**:
- **Not None**: If specified, LoftQ will quantize the backbone weights and initialize the LoRA layers. It requires setting `init_lora_weights='loftq'`.
- **None**: LoftQ quantization is not applied.
- **Note**: Do not pass an already quantized model when using LoftQ as LoftQ handles the quantization process itself.


**use_rslora**
- **Description**: Enables Rank-Stabilized LoRA (RSLora).
- **Impact**:
- **True**: Uses Rank-Stabilized LoRA, setting the adapter scaling factor to `lora_alpha/math.sqrt(r)`, which the [Rank-Stabilized LoRA paper](https://doi.org/10.48550/arXiv.2312.03732) reports works better.
- **False**: Uses the original default scaling factor `lora_alpha/r`.
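
A minimal sketch of how these parameters are typically passed through Unsloth's `FastLanguageModel.get_peft_model`; the values are illustrative placeholders rather than recommendations, and `model` is assumed to come from `FastLanguageModel.from_pretrained`:

```python
from unsloth import FastLanguageModel

model = FastLanguageModel.get_peft_model(
    model,                 # base model from FastLanguageModel.from_pretrained
    r = 16,                # rank of the low-rank decomposition
    lora_alpha = 16,       # scaling factor for the LoRA update
    lora_dropout = 0.05,   # dropout applied to the low-rank matrices
    use_rslora = False,    # True scales by lora_alpha / sqrt(r) instead of lora_alpha / r
    loftq_config = None,   # pass a LoftQ configuration to quantize the backbone with LoftQ
)
```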

**gradient_accumulation_steps**
- **Default**: 1
- **Description**: The number of steps over which gradients are accumulated before performing an optimizer (weight-update) step.
- **Impact**:
- **Higher**: Accumulates gradients over multiple steps, effectively increasing the batch size without requiring additional memory. This can improve training stability and convergence, especially with large models on limited hardware.
- **Lower**: More frequent weight updates with a smaller effective batch size; each update is faster, but training can be noisier and less stable.

**weight_decay**
- **Default**: 0.01
- **Description**: Regularization technique that applies a small penalty to the weights during training.
- **Impact**:
- **Non-zero Value (e.g., 0.01)**: Adds a penalty proportional to the magnitude of the weights to the loss function, helping to prevent overfitting by discouraging large weights.
- **Zero**: No weight decay is applied, which can lead to overfitting, especially in large models or with small datasets.

**learning_rate**
- **Default**: 2e-4
- **Description**: The rate at which the model updates its parameters during training.
- **Impact**:
- **Higher**: Faster convergence but risks overshooting optimal parameters and causing instability in training.
- **Lower**: More stable and precise updates but may slow down convergence, requiring more training steps to achieve good performance.
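
These last three settings are trainer hyperparameters rather than `LoraConfig` fields. A minimal sketch with TRL's `SFTTrainer` and `transformers.TrainingArguments`, assuming `model`, `tokenizer`, and a `dataset` with a `"text"` column already exist:

```python
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = 2048,
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,   # effective batch size = 2 * 4 = 8
        learning_rate = 2e-4,
        weight_decay = 0.01,
        bf16 = is_bfloat16_supported(),    # helper exported by unsloth in this PR
        fp16 = not is_bfloat16_supported(),
        max_steps = 60,
        output_dir = "outputs",
    ),
)
trainer.train()
```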

## Target Modules

**q_proj (query projection)**
- **Description**: Part of the attention mechanism in transformer models, responsible for projecting the input into the query space.
- **Impact**: Transforms the input into query vectors that are used to compute attention scores.

**k_proj (key projection)**
- **Description**: Projects the input into the key space in the attention mechanism.
- **Impact**: Produces key vectors that are compared with query vectors to determine attention weights.

**v_proj (value projection)**
- **Description**: Projects the input into the value space in the attention mechanism.
- **Impact**: Produces value vectors that are weighted by the attention scores and combined to form the output.

**o_proj (output projection)**
- **Description**: Projects the output of the attention mechanism back into the original space.
- **Impact**: Transforms the combined weighted value vectors back to the input dimension, integrating attention results into the model.

**gate_proj (gate projection)**
- **Description**: The gating projection in gated feed-forward blocks (for example, the SwiGLU MLP used in Llama-style models); more generally, part of gating mechanisms that modulate information flow.
- **Impact**: Controls the flow of information through the gate, allowing selective information passage based on learned weights.

**up_proj (up projection)**
- **Description**: Used for up-projection, typically increasing the dimensionality of the input.
- **Impact**: Expands the input to a higher-dimensional space, often used in feedforward layers or when transitioning between different layers with differing dimensionalities.

**down_proj (down projection)**
- **Description**: Used for down-projection, typically reducing the dimensionality of the input.
- **Impact**: Compresses the input to a lower-dimensional space, useful for reducing computational complexity and controlling the model size.
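
As a sketch, these names are what typically goes into `target_modules` for Llama-style models; attention-only versus attention-plus-MLP is the usual trade-off between trainable-parameter count and adapter capacity:

```python
# Adapt only the attention projections (fewer trainable parameters) ...
attention_only = ["q_proj", "k_proj", "v_proj", "o_proj"]

# ... or the attention projections plus the gated MLP (more capacity, more memory).
attention_and_mlp = attention_only + ["gate_proj", "up_proj", "down_proj"]

model = FastLanguageModel.get_peft_model(model, r = 16, target_modules = attention_and_mlp)
```
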
29 changes: 25 additions & 4 deletions unsloth/__init__.py
@@ -14,8 +14,20 @@
import os
import warnings
import importlib
import sys
from packaging.version import Version

# Currently only supports 1 GPU, or else seg faults will occur.
# Define a list of modules to check
MODULES_TO_CHECK = ["peft", "bitsandbytes"]

# Check if any of the modules in the list have been imported
for module in MODULES_TO_CHECK:
if module in sys.modules:
raise ImportError(f"Unsloth: Please import Unsloth before {module}.")
pass
pass

# Currently only supports 1 GPU, or else seg faults will occur.
if "CUDA_VISIBLE_DEVICES" in os.environ:
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
devices = os.environ["CUDA_VISIBLE_DEVICES"]
@@ -66,8 +78,14 @@ def is_bf16_supported(): return SUPPORTS_BFLOAT16

# Try loading bitsandbytes and triton
import bitsandbytes as bnb

import triton
from triton.common.build import libcuda_dirs
libcuda_dirs = lambda: None
if Version(triton.__version__) >= Version("3.0.0"):
try: from triton.backends.nvidia.driver import libcuda_dirs
except: pass
else: from triton.common.build import libcuda_dirs

import os
import re
import numpy as np
@@ -103,8 +121,11 @@ def is_bf16_supported(): return SUPPORTS_BFLOAT16
importlib.reload(bnb)
importlib.reload(triton)
try:
import bitsandbytes as bnb
from triton.common.build import libcuda_dirs
libcuda_dirs = lambda: None
if Version(triton.__version__) >= Version("3.0.0"):
try: from triton.backends.nvidia.driver import libcuda_dirs
except: pass
else: from triton.common.build import libcuda_dirs
cdequantize_blockwise_fp32 = bnb.functional.lib.cdequantize_blockwise_fp32
libcuda_dirs()
except:
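
The new guard above means user code now has to import Unsloth before `peft` or `bitsandbytes`; a sketch of the expected ordering:

```python
# Unsloth must be imported first so its patches land before peft / bitsandbytes
# initialise; reversing the order raises "Unsloth: Please import Unsloth before peft."
from unsloth import FastLanguageModel
import peft
import bitsandbytes as bnb
```
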
2 changes: 1 addition & 1 deletion unsloth/chat_templates.py
@@ -1286,7 +1286,7 @@ def test_hf_gguf_equivalence(tokenizer, gguf_model = "./model-unsloth.F16.gguf")
pass

for prompt in prompts:
command = f"./llama.cpp/main -m {gguf_model} -n 0 --temp 0.0 --verbose-prompt "\
command = f"./llama.cpp/llama-cli -m {gguf_model} -n 0 --temp 0.0 --verbose-prompt "\
f"--check-tensors -p '{prompt}'"

datas = []
1 change: 1 addition & 0 deletions unsloth/kernels/__init__.py
@@ -24,6 +24,7 @@
)
from .fast_lora import (
get_lora_parameters,
get_lora_parameters_bias,
apply_lora_mlp_swiglu,
apply_lora_mlp_geglu_exact,
apply_lora_mlp_geglu_approx,
8 changes: 7 additions & 1 deletion unsloth/kernels/fast_lora.py
@@ -13,7 +13,13 @@
# limitations under the License.

import torch
from .utils import fast_dequantize, QUANT_STATE, get_lora_parameters, matmul_lora
from .utils import (
fast_dequantize,
QUANT_STATE,
get_lora_parameters,
get_lora_parameters_bias,
matmul_lora,
)


class LoRA_MLP(torch.autograd.Function):
68 changes: 38 additions & 30 deletions unsloth/models/loader.py
@@ -33,11 +33,8 @@

def _get_model_name(model_name, load_in_4bit = True):

# First try replacing lowercase 'b' with uppercase 'B'
model_name = model_name.lower()

if not SUPPORTS_FOURBIT and model_name in INT_TO_FLOAT_MAPPER:
model_name = INT_TO_FLOAT_MAPPER[model_name]
model_name = INT_TO_FLOAT_MAPPER[model_name.lower()]
logger.warning_once(
f"Unsloth: Your transformers version of {transformers_version} does not support native "\
f"4bit loading.\nThe minimum required version is 4.37.\n"\
@@ -47,15 +44,15 @@ def _get_model_name(model_name, load_in_4bit = True):
)

elif not load_in_4bit and model_name in INT_TO_FLOAT_MAPPER:
new_model_name = INT_TO_FLOAT_MAPPER[model_name]
new_model_name = INT_TO_FLOAT_MAPPER[model_name.lower()]
# logger.warning_once(
# f"Unsloth: You passed in `{model_name}` which is a 4bit model, yet you set\n"\
# f"`load_in_4bit = False`. We shall load `{new_model_name}` instead."
# )
model_name = new_model_name

elif load_in_4bit and SUPPORTS_FOURBIT and model_name in FLOAT_TO_INT_MAPPER:
new_model_name = FLOAT_TO_INT_MAPPER[model_name]
new_model_name = FLOAT_TO_INT_MAPPER[model_name.lower()]
# logger.warning_once(
# f"Unsloth: You passed in `{model_name}` and `load_in_4bit = True`.\n"\
# f"We shall load `{new_model_name}` for 4x faster loading."
@@ -70,17 +67,18 @@
class FastLanguageModel(FastLlamaModel):
@staticmethod
def from_pretrained(
model_name = "unsloth/llama-3-8b-bnb-4bit",
max_seq_length = None,
dtype = None,
load_in_4bit = True,
token = None,
device_map = "sequential",
rope_scaling = None,
fix_tokenizer = True,
trust_remote_code = False,
use_gradient_checkpointing = True,
resize_model_vocab = None,
model_name = "unsloth/llama-3-8b-bnb-4bit",
max_seq_length = None,
dtype = None,
load_in_4bit = True,
token = None,
device_map = "sequential",
rope_scaling = None,
fix_tokenizer = True,
trust_remote_code = False,
use_gradient_checkpointing = "unsloth",
resize_model_vocab = None,
revision = None,
*args, **kwargs,
):
if token is None and "HF_TOKEN" in os.environ:
@@ -95,12 +93,12 @@ def from_pretrained(
# First check if it's a normal model via AutoConfig
is_peft = False
try:
model_config = AutoConfig.from_pretrained(model_name, token = token)
model_config = AutoConfig.from_pretrained(model_name, token = token, revision = revision)
is_peft = False
except:
try:
# Most likely a PEFT model
peft_config = PeftConfig.from_pretrained(model_name, token = token)
peft_config = PeftConfig.from_pretrained(model_name, token = token, revision = revision)
except:
raise RuntimeError(f"Unsloth: `{model_name}` is not a full model or a PEFT model.")

@@ -143,22 +141,24 @@ def from_pretrained(
pass

model, tokenizer = dispatch_model.from_pretrained(
model_name = model_name,
max_seq_length = max_seq_length,
dtype = dtype,
load_in_4bit = load_in_4bit,
token = token,
device_map = device_map,
rope_scaling = rope_scaling,
fix_tokenizer = fix_tokenizer,
model_patcher = dispatch_model,
tokenizer_name = tokenizer_name,
model_name = model_name,
max_seq_length = max_seq_length,
dtype = dtype,
load_in_4bit = load_in_4bit,
token = token,
device_map = device_map,
rope_scaling = rope_scaling,
fix_tokenizer = fix_tokenizer,
model_patcher = dispatch_model,
tokenizer_name = tokenizer_name,
trust_remote_code = trust_remote_code,
revision = revision if not is_peft else None,
*args, **kwargs,
)

if resize_model_vocab is not None:
model.resize_token_embeddings(resize_model_vocab)
pass

# In case the model supports tagging, add the unsloth tag.
if hasattr(model, "add_model_tags"):
@@ -188,8 +188,16 @@ def from_pretrained(
pass

if is_peft:
# From https://github.com/huggingface/peft/issues/184
# Now add PEFT adapters
model = PeftModel.from_pretrained(model, old_model_name, token = token)
model.enable_input_require_grads()
model = PeftModel.from_pretrained(
model,
old_model_name,
token = token,
revision = revision,
is_trainable = True,
)
# Patch it as well!
model = dispatch_model.patch_peft_model(model, use_gradient_checkpointing)
pass
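
With the changes above, `from_pretrained` forwards the new `revision` argument (a Hub branch, tag, or commit) to both full and PEFT checkpoints; a brief sketch with illustrative values:

```python
from unsloth import FastLanguageModel

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/llama-3-8b-bnb-4bit",
    max_seq_length = 2048,
    load_in_4bit = True,
    revision = "main",   # pin any branch, tag, or commit hash on the Hub
)
```
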
3 changes: 3 additions & 0 deletions unsloth/models/mapper.py
@@ -186,6 +186,9 @@
"unsloth/Qwen2-70B-Instruct-bnb-4bit" : (
"Qwen/Qwen2-70B-Instruct",
),
"mistralai/Codestral-22B-v0.1" : (
"mistral-community/Codestral-22B-v0.1",
),
}

INT_TO_FLOAT_MAPPER = {}