Nightly #632

Merged
133 commits, merged Jun 13, 2024
Commits
7df08c4
Update llama.py
danielhanchen May 19, 2024
ba5b6ce
offload
danielhanchen May 19, 2024
a07057e
Update llama.py
danielhanchen May 19, 2024
4be9063
Update llama.py
danielhanchen May 19, 2024
3dc3d3f
Update llama.py
danielhanchen May 19, 2024
f1cc1e8
Update llama.py
danielhanchen May 19, 2024
5cb531a
Update llama.py
danielhanchen May 19, 2024
6bd8e60
Update llama.py
danielhanchen May 19, 2024
d1d57ff
Update llama.py
danielhanchen May 19, 2024
7470f67
continued pretraining trainer
danielhanchen May 20, 2024
da9c1a6
Update trainer.py
danielhanchen May 20, 2024
2c68f56
Update trainer.py
danielhanchen May 20, 2024
217bf9d
Update trainer.py
danielhanchen May 20, 2024
6e85384
Update trainer.py
danielhanchen May 21, 2024
77f9c51
is_bfloat16_supported
danielhanchen May 21, 2024
c0e1d27
Update __init__.py
danielhanchen May 21, 2024
2b23b93
Update README.md
danielhanchen May 21, 2024
902e23a
Update llama.py
danielhanchen May 21, 2024
98f41ce
Merge branch 'main' into nightly
danielhanchen May 22, 2024
3193cac
is_bfloat16_supported
danielhanchen May 22, 2024
dfeaf4b
Update __init__.py
danielhanchen May 22, 2024
1e84090
Mistral v3
danielhanchen May 22, 2024
f63f32b
Merge branch 'main' into nightly
danielhanchen May 23, 2024
57ad8e7
Phi 3 medium
danielhanchen May 23, 2024
2b994b2
Update chat_templates.py
danielhanchen May 23, 2024
ff8171f
Update chat_templates.py
danielhanchen May 23, 2024
5ca8b58
Phi-3
danielhanchen May 23, 2024
98c2e81
Merge branch 'main' into nightly
danielhanchen May 23, 2024
3817660
Merge branch 'main' into nightly
danielhanchen May 23, 2024
f858145
Merge branch 'main' into nightly
danielhanchen May 24, 2024
a1328f6
Update save.py
danielhanchen May 24, 2024
fb29673
Update README.md
shimmyshimmer May 25, 2024
fa85556
Untrained tokens
danielhanchen May 26, 2024
c511aca
Update tokenizer_utils.py
danielhanchen May 26, 2024
35e7355
Update tokenizer_utils.py
danielhanchen May 26, 2024
cc0bf44
Update tokenizer_utils.py
danielhanchen May 26, 2024
674ba66
Update tokenizer_utils.py
danielhanchen May 26, 2024
9823f52
Update tokenizer_utils.py
danielhanchen May 26, 2024
c0c761b
Update tokenizer_utils.py
danielhanchen May 26, 2024
e2850c0
Update tokenizer_utils.py
danielhanchen May 26, 2024
8e12780
Update tokenizer_utils.py
danielhanchen May 26, 2024
6f1855e
Update tokenizer_utils.py
danielhanchen May 26, 2024
d27b173
Update tokenizer_utils.py
danielhanchen May 26, 2024
7bf7399
Update tokenizer_utils.py
danielhanchen May 26, 2024
31ecef9
Update tokenizer_utils.py
danielhanchen May 26, 2024
b67d93f
Update tokenizer_utils.py
danielhanchen May 26, 2024
e874ccd
Update tokenizer_utils.py
danielhanchen May 26, 2024
d7b54ff
Update tokenizer_utils.py
danielhanchen May 27, 2024
5a4a512
Update tokenizer_utils.py
danielhanchen May 27, 2024
82c040e
Update tokenizer_utils.py
danielhanchen May 27, 2024
8e227b2
Update tokenizer_utils.py
danielhanchen May 27, 2024
250d386
Update tokenizer_utils.py
danielhanchen May 27, 2024
e6db3ba
Update llama.py
danielhanchen May 27, 2024
e673fa2
Update tokenizer_utils.py
danielhanchen May 27, 2024
222b835
Update tokenizer_utils.py
danielhanchen May 27, 2024
6404aa5
Update tokenizer_utils.py
danielhanchen May 27, 2024
cfea7b2
Update tokenizer_utils.py
danielhanchen May 27, 2024
083e5ba
Update save.py
danielhanchen May 27, 2024
6f2565c
Update save.py
danielhanchen May 27, 2024
c19b04e
Update save.py
danielhanchen May 27, 2024
64b12a2
checkpoint
danielhanchen May 28, 2024
4cd5a8a
Merge branch 'main' into nightly
danielhanchen May 28, 2024
196faec
Update _utils.py
danielhanchen May 28, 2024
235be40
Update tokenizer_utils.py
danielhanchen May 29, 2024
cf9090a
Update tokenizer_utils.py
danielhanchen May 29, 2024
1fb1110
Update tokenizer_utils.py
danielhanchen May 29, 2024
d1bd60c
Update llama.py
danielhanchen May 30, 2024
732ead0
accelerate
danielhanchen May 30, 2024
359ae5c
Update _utils.py
danielhanchen May 30, 2024
8dcfad3
Update _utils.py
danielhanchen May 30, 2024
2bafc57
Update _utils.py
danielhanchen May 30, 2024
90f6311
Update _utils.py
danielhanchen May 30, 2024
7b84ff7
Update _utils.py
danielhanchen May 30, 2024
60f4b9a
Update _utils.py
danielhanchen May 30, 2024
3ebe5a5
Update _utils.py
danielhanchen May 30, 2024
7bbc8ce
Update tokenizer_utils.py
danielhanchen May 30, 2024
6f5c84c
train_dataloader
danielhanchen May 30, 2024
0d269ca
Update llama.py
danielhanchen May 30, 2024
6b7c142
Update llama.py
danielhanchen May 30, 2024
54f3a74
Update llama.py
danielhanchen May 30, 2024
0bc96c5
use_fast_convert
danielhanchen May 30, 2024
02c91b0
Merge branch 'main' into nightly
danielhanchen May 30, 2024
b384ff0
Merge branch 'main' into nightly
danielhanchen May 30, 2024
a8b5d89
Update save.py
danielhanchen May 30, 2024
872d569
Update save.py
danielhanchen May 30, 2024
3a1f5f2
Update save.py
danielhanchen May 30, 2024
bcadc8c
Update save.py
danielhanchen Jun 2, 2024
1381820
remove_special_tokens
danielhanchen Jun 2, 2024
e01b87d
Ollama
danielhanchen Jun 2, 2024
b3479c7
Update chat_templates.py
danielhanchen Jun 3, 2024
86804dc
Update chat_templates.py
danielhanchen Jun 3, 2024
87fdd3a
Update chat_templates.py
danielhanchen Jun 3, 2024
5c5df69
Merge branch 'main' into nightly
danielhanchen Jun 7, 2024
6386d94
Update llama.py
danielhanchen Jun 7, 2024
b1a9551
Update chat_templates.py
danielhanchen Jun 9, 2024
344a05d
Support bfloat16 GGUF
danielhanchen Jun 9, 2024
6b11e0d
Update save.py
danielhanchen Jun 9, 2024
c6e4b5b
Update llama.py
danielhanchen Jun 9, 2024
57f29ab
fast_forward_inference
danielhanchen Jun 9, 2024
d32e972
Update mapper.py
danielhanchen Jun 9, 2024
e121fa5
Update loader.py
danielhanchen Jun 9, 2024
5eaa10f
Update llama.py
danielhanchen Jun 9, 2024
f57d28d
Update tokenizer_utils.py
danielhanchen Jun 10, 2024
8937507
info
danielhanchen Jun 11, 2024
8982edb
edits
danielhanchen Jun 11, 2024
8904605
Create chat template
danielhanchen Jun 11, 2024
2a374c2
Fix tokenizer
danielhanchen Jun 12, 2024
d704b73
Merge branch 'main' into nightly
danielhanchen Jun 13, 2024
8176155
Update tokenizer_utils.py
danielhanchen Jun 13, 2024
21a99f1
fix case where gguf saving fails due to first_conversion dtype (#630)
chrehall68 Jun 13, 2024
dbf2dcf
Support revision parameter in FastLanguageModel.from_pretrained (#629)
chrehall68 Jun 13, 2024
9016171
clears any selected_adapters before calling internal_model.save_pretr…
neph1 Jun 13, 2024
0428920
Update __init__.py (#602)
xyangk Jun 13, 2024
9fdd847
Fixed unsloth/tokenizer_utils.py for chat training (#604)
Oseltamivir Jun 13, 2024
b5fc6aa
Add GGML saving option to Unsloth for easier Ollama model creation an…
mahiatlinux Jun 13, 2024
3fafbf7
docs: Add LoraConfig parameters documentation (#619)
sebdg Jun 13, 2024
273a871
llama.cpp failing (#371)
bet0x Jun 13, 2024
b312b3f
fix libcuda_dirs import for triton 3.0 (#227)
t-vi Jun 13, 2024
1601dca
Update save.py
danielhanchen Jun 13, 2024
26dc502
Update __init__.py
danielhanchen Jun 13, 2024
6a51657
Update fast_lora.py
danielhanchen Jun 13, 2024
4a8ba90
Update save.py
danielhanchen Jun 13, 2024
0abb5ba
Update save.py
danielhanchen Jun 13, 2024
b24dd05
Update save.py
danielhanchen Jun 13, 2024
48c6d6d
Update loader.py
danielhanchen Jun 13, 2024
e35f608
Update save.py
danielhanchen Jun 13, 2024
4822eae
Update save.py
danielhanchen Jun 13, 2024
7d847ed
quantize now llama-quantize
danielhanchen Jun 13, 2024
82f10cb
Update chat_templates.py
danielhanchen Jun 13, 2024
08424f0
Update loader.py
danielhanchen Jun 13, 2024
eb906d0
Update mapper.py
danielhanchen Jun 13, 2024
0a304ae
Update __init__.py
danielhanchen Jun 13, 2024
71edc42
embedding size
danielhanchen Jun 13, 2024
87 changes: 87 additions & 0 deletions PARAMETERS.md
@@ -0,0 +1,87 @@
## LoraConfig Parameters

Adjusting the `LoraConfig` parameters allows you to balance model performance and computational efficiency in Low-Rank Adaptation (LoRA). Here’s a concise breakdown of key parameters:

**r**
- **Description**: Rank of the low-rank decomposition for factorizing weight matrices.
- **Impact**:
- **Higher**: Retains more information, increases computational load.
- **Lower**: Fewer parameters, more efficient training, potential performance drop if too small.


**lora_alpha**
- **Description**: Scaling factor for the low-rank matrices' contribution.
- **Impact**:
- **Higher**: Increases influence, speeds up convergence, risks instability or overfitting.
- **Lower**: Subtler effect, may require more training steps.

**lora_dropout**
- **Description**: Probability of zeroing out elements in low-rank matrices for regularization.
- **Impact**:
- **Higher**: More regularization, prevents overfitting, may slow training and degrade performance.
- **Lower**: Less regularization, may speed up training, risks overfitting.

**loftq_config**
- **Description**: Configuration for LoftQ, a quantization method for the backbone weights and initialization of LoRA layers.
- **Impact**:
- **Not None**: If specified, LoftQ will quantize the backbone weights and initialize the LoRA layers. It requires setting `init_lora_weights='loftq'`.
- **None**: LoftQ quantization is not applied.
- **Note**: Do not pass an already quantized model when using LoftQ as LoftQ handles the quantization process itself.


**use_rslora**
- **Description**: Enables Rank-Stabilized LoRA (RSLora).
- **Impact**:
- **True**: Uses Rank-Stabilized LoRA, setting the adapter scaling factor to `lora_alpha/math.sqrt(r)`, which the [Rank-Stabilized LoRA paper](https://doi.org/10.48550/arXiv.2312.03732) reports works better.
- **False**: Uses the original default scaling factor `lora_alpha/r`.
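
A minimal sketch of how these parameters are typically passed through Unsloth's `FastLanguageModel.get_peft_model`; the values are illustrative placeholders rather than recommendations, and `model` is assumed to come from `FastLanguageModel.from_pretrained`:

```python
from unsloth import FastLanguageModel

model = FastLanguageModel.get_peft_model(
    model,                 # base model from FastLanguageModel.from_pretrained
    r = 16,                # rank of the low-rank decomposition
    lora_alpha = 16,       # scaling factor for the LoRA update
    lora_dropout = 0.05,   # dropout applied to the low-rank matrices
    use_rslora = False,    # True scales by lora_alpha / sqrt(r) instead of lora_alpha / r
    loftq_config = None,   # pass a LoftQ configuration to quantize the backbone with LoftQ
)
```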

**gradient_accumulation_steps**
- **Default**: 1
- **Description**: The number of steps over which gradients are accumulated before performing an optimizer (weight-update) step.
- **Impact**:
- **Higher**: Accumulates gradients over multiple steps, effectively increasing the batch size without requiring additional memory. This can improve training stability and convergence, especially with large models on limited hardware.
- **Lower**: More frequent weight updates with a smaller effective batch size; each update is faster, but training can be noisier and less stable.

**weight_decay**
- **Default**: 0.01
- **Description**: Regularization technique that applies a small penalty to the weights during training.
- **Impact**:
- **Non-zero Value (e.g., 0.01)**: Adds a penalty proportional to the magnitude of the weights to the loss function, helping to prevent overfitting by discouraging large weights.
- **Zero**: No weight decay is applied, which can lead to overfitting, especially in large models or with small datasets.

**learning_rate**
- **Default**: 2e-4
- **Description**: The rate at which the model updates its parameters during training.
- **Impact**:
- **Higher**: Faster convergence but risks overshooting optimal parameters and causing instability in training.
- **Lower**: More stable and precise updates but may slow down convergence, requiring more training steps to achieve good performance.
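
These last three settings are trainer hyperparameters rather than `LoraConfig` fields. A minimal sketch with TRL's `SFTTrainer` and `transformers.TrainingArguments`, assuming `model`, `tokenizer`, and a `dataset` with a `"text"` column already exist:

```python
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = 2048,
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,   # effective batch size = 2 * 4 = 8
        learning_rate = 2e-4,
        weight_decay = 0.01,
        bf16 = is_bfloat16_supported(),    # helper exported by unsloth in this PR
        fp16 = not is_bfloat16_supported(),
        max_steps = 60,
        output_dir = "outputs",
    ),
)
trainer.train()
```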

## Target Modules

**q_proj (query projection)**
- **Description**: Part of the attention mechanism in transformer models, responsible for projecting the input into the query space.
- **Impact**: Transforms the input into query vectors that are used to compute attention scores.

**k_proj (key projection)**
- **Description**: Projects the input into the key space in the attention mechanism.
- **Impact**: Produces key vectors that are compared with query vectors to determine attention weights.

**v_proj (value projection)**
- **Description**: Projects the input into the value space in the attention mechanism.
- **Impact**: Produces value vectors that are weighted by the attention scores and combined to form the output.

**o_proj (output projection)**
- **Description**: Projects the output of the attention mechanism back into the original space.
- **Impact**: Transforms the combined weighted value vectors back to the input dimension, integrating attention results into the model.

**gate_proj (gate projection)**
- **Description**: The gating projection in gated feed-forward blocks (for example, the SwiGLU MLP used in Llama-style models); more generally, part of gating mechanisms that modulate information flow.
- **Impact**: Controls the flow of information through the gate, allowing selective information passage based on learned weights.

**up_proj (up projection)**
- **Description**: Used for up-projection, typically increasing the dimensionality of the input.
- **Impact**: Expands the input to a higher-dimensional space, often used in feedforward layers or when transitioning between different layers with differing dimensionalities.

**down_proj (down projection)**
- **Description**: Used for down-projection, typically reducing the dimensionality of the input.
- **Impact**: Compresses the input to a lower-dimensional space, useful for reducing computational complexity and controlling the model size.
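
As a sketch, these names are what typically goes into `target_modules` for Llama-style models; attention-only versus attention-plus-MLP is the usual trade-off between trainable-parameter count and adapter capacity:

```python
# Adapt only the attention projections (fewer trainable parameters) ...
attention_only = ["q_proj", "k_proj", "v_proj", "o_proj"]

# ... or the attention projections plus the gated MLP (more capacity, more memory).
attention_and_mlp = attention_only + ["gate_proj", "up_proj", "down_proj"]

model = FastLanguageModel.get_peft_model(model, r = 16, target_modules = attention_and_mlp)
```
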
29 changes: 25 additions & 4 deletions unsloth/__init__.py
@@ -14,8 +14,20 @@
import os
import warnings
import importlib
import sys
from packaging.version import Version

# Currently only supports 1 GPU, or else seg faults will occur.
# Define a list of modules to check
MODULES_TO_CHECK = ["peft", "bitsandbytes"]

# Check if any of the modules in the list have been imported
for module in MODULES_TO_CHECK:
if module in sys.modules:
raise ImportError(f"Unsloth: Please import Unsloth before {module}.")
pass
pass

# Currently only supports 1 GPU, or else seg faults will occur.
if "CUDA_VISIBLE_DEVICES" in os.environ:
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
devices = os.environ["CUDA_VISIBLE_DEVICES"]
@@ -66,8 +78,14 @@ def is_bf16_supported(): return SUPPORTS_BFLOAT16

# Try loading bitsandbytes and triton
import bitsandbytes as bnb

import triton
from triton.common.build import libcuda_dirs
libcuda_dirs = lambda: None
if Version(triton.__version__) >= Version("3.0.0"):
try: from triton.backends.nvidia.driver import libcuda_dirs
except: pass
else: from triton.common.build import libcuda_dirs

import os
import re
import numpy as np
@@ -103,8 +121,11 @@ def is_bf16_supported(): return SUPPORTS_BFLOAT16
importlib.reload(bnb)
importlib.reload(triton)
try:
import bitsandbytes as bnb
from triton.common.build import libcuda_dirs
libcuda_dirs = lambda: None
if Version(triton.__version__) >= Version("3.0.0"):
try: from triton.backends.nvidia.driver import libcuda_dirs
except: pass
else: from triton.common.build import libcuda_dirs
cdequantize_blockwise_fp32 = bnb.functional.lib.cdequantize_blockwise_fp32
libcuda_dirs()
except:
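
The new guard above means user code now has to import Unsloth before `peft` or `bitsandbytes`; a sketch of the expected ordering:

```python
# Unsloth must be imported first so its patches land before peft / bitsandbytes
# initialise; reversing the order raises "Unsloth: Please import Unsloth before peft."
from unsloth import FastLanguageModel
import peft
import bitsandbytes as bnb
```
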
2 changes: 1 addition & 1 deletion unsloth/chat_templates.py
@@ -1286,7 +1286,7 @@ def test_hf_gguf_equivalence(tokenizer, gguf_model = "./model-unsloth.F16.gguf")
pass

for prompt in prompts:
command = f"./llama.cpp/main -m {gguf_model} -n 0 --temp 0.0 --verbose-prompt "\
command = f"./llama.cpp/llama-cli -m {gguf_model} -n 0 --temp 0.0 --verbose-prompt "\
f"--check-tensors -p '{prompt}'"

datas = []
1 change: 1 addition & 0 deletions unsloth/kernels/__init__.py
@@ -24,6 +24,7 @@
)
from .fast_lora import (
get_lora_parameters,
get_lora_parameters_bias,
apply_lora_mlp_swiglu,
apply_lora_mlp_geglu_exact,
apply_lora_mlp_geglu_approx,
8 changes: 7 additions & 1 deletion unsloth/kernels/fast_lora.py
@@ -13,7 +13,13 @@
# limitations under the License.

import torch
from .utils import fast_dequantize, QUANT_STATE, get_lora_parameters, matmul_lora
from .utils import (
fast_dequantize,
QUANT_STATE,
get_lora_parameters,
get_lora_parameters_bias,
matmul_lora,
)


class LoRA_MLP(torch.autograd.Function):
68 changes: 38 additions & 30 deletions unsloth/models/loader.py
@@ -33,11 +33,8 @@

def _get_model_name(model_name, load_in_4bit = True):

# First try replacing lowercase 'b' with uppercase 'B'
model_name = model_name.lower()

if not SUPPORTS_FOURBIT and model_name in INT_TO_FLOAT_MAPPER:
model_name = INT_TO_FLOAT_MAPPER[model_name]
model_name = INT_TO_FLOAT_MAPPER[model_name.lower()]
logger.warning_once(
f"Unsloth: Your transformers version of {transformers_version} does not support native "\
f"4bit loading.\nThe minimum required version is 4.37.\n"\
@@ -47,15 +44,15 @@ def _get_model_name(model_name, load_in_4bit = True):
)

elif not load_in_4bit and model_name in INT_TO_FLOAT_MAPPER:
new_model_name = INT_TO_FLOAT_MAPPER[model_name]
new_model_name = INT_TO_FLOAT_MAPPER[model_name.lower()]
# logger.warning_once(
# f"Unsloth: You passed in `{model_name}` which is a 4bit model, yet you set\n"\
# f"`load_in_4bit = False`. We shall load `{new_model_name}` instead."
# )
model_name = new_model_name

elif load_in_4bit and SUPPORTS_FOURBIT and model_name in FLOAT_TO_INT_MAPPER:
new_model_name = FLOAT_TO_INT_MAPPER[model_name]
new_model_name = FLOAT_TO_INT_MAPPER[model_name.lower()]
# logger.warning_once(
# f"Unsloth: You passed in `{model_name}` and `load_in_4bit = True`.\n"\
# f"We shall load `{new_model_name}` for 4x faster loading."
@@ -70,17 +67,18 @@
class FastLanguageModel(FastLlamaModel):
@staticmethod
def from_pretrained(
model_name = "unsloth/llama-3-8b-bnb-4bit",
max_seq_length = None,
dtype = None,
load_in_4bit = True,
token = None,
device_map = "sequential",
rope_scaling = None,
fix_tokenizer = True,
trust_remote_code = False,
use_gradient_checkpointing = True,
resize_model_vocab = None,
model_name = "unsloth/llama-3-8b-bnb-4bit",
max_seq_length = None,
dtype = None,
load_in_4bit = True,
token = None,
device_map = "sequential",
rope_scaling = None,
fix_tokenizer = True,
trust_remote_code = False,
use_gradient_checkpointing = "unsloth",
resize_model_vocab = None,
revision = None,
*args, **kwargs,
):
if token is None and "HF_TOKEN" in os.environ:
@@ -95,12 +93,12 @@ def from_pretrained(
# First check if it's a normal model via AutoConfig
is_peft = False
try:
model_config = AutoConfig.from_pretrained(model_name, token = token)
model_config = AutoConfig.from_pretrained(model_name, token = token, revision = revision)
is_peft = False
except:
try:
# Most likely a PEFT model
peft_config = PeftConfig.from_pretrained(model_name, token = token)
peft_config = PeftConfig.from_pretrained(model_name, token = token, revision = revision)
except:
raise RuntimeError(f"Unsloth: `{model_name}` is not a full model or a PEFT model.")

@@ -143,22 +141,24 @@ def from_pretrained(
pass

model, tokenizer = dispatch_model.from_pretrained(
model_name = model_name,
max_seq_length = max_seq_length,
dtype = dtype,
load_in_4bit = load_in_4bit,
token = token,
device_map = device_map,
rope_scaling = rope_scaling,
fix_tokenizer = fix_tokenizer,
model_patcher = dispatch_model,
tokenizer_name = tokenizer_name,
model_name = model_name,
max_seq_length = max_seq_length,
dtype = dtype,
load_in_4bit = load_in_4bit,
token = token,
device_map = device_map,
rope_scaling = rope_scaling,
fix_tokenizer = fix_tokenizer,
model_patcher = dispatch_model,
tokenizer_name = tokenizer_name,
trust_remote_code = trust_remote_code,
revision = revision if not is_peft else None,
*args, **kwargs,
)

if resize_model_vocab is not None:
model.resize_token_embeddings(resize_model_vocab)
pass

# In case the model supports tagging, add the unsloth tag.
if hasattr(model, "add_model_tags"):
@@ -188,8 +188,16 @@ def from_pretrained(
pass

if is_peft:
# From https://github.com/huggingface/peft/issues/184
# Now add PEFT adapters
model = PeftModel.from_pretrained(model, old_model_name, token = token)
model.enable_input_require_grads()
model = PeftModel.from_pretrained(
model,
old_model_name,
token = token,
revision = revision,
is_trainable = True,
)
# Patch it as well!
model = dispatch_model.patch_peft_model(model, use_gradient_checkpointing)
pass
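
With the changes above, `from_pretrained` forwards the new `revision` argument (a Hub branch, tag, or commit) to both full and PEFT checkpoints; a brief sketch with illustrative values:

```python
from unsloth import FastLanguageModel

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/llama-3-8b-bnb-4bit",
    max_seq_length = 2048,
    load_in_4bit = True,
    revision = "main",   # pin any branch, tag, or commit hash on the Hub
)
```
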
3 changes: 3 additions & 0 deletions unsloth/models/mapper.py
@@ -186,6 +186,9 @@
"unsloth/Qwen2-70B-Instruct-bnb-4bit" : (
"Qwen/Qwen2-70B-Instruct",
),
"mistralai/Codestral-22B-v0.1" : (
"mistral-community/Codestral-22B-v0.1",
),
}

INT_TO_FLOAT_MAPPER = {}