Complete guide to full-parameter fine-tuning for maximum model customization and performance.
# Full fine-tuning configuration
config = {
    "method": "full",
    "model": "llama-2-7b",
    "learning_rate": 1e-5,
    "batch_size": 8,
    "gradient_accumulation_steps": 4,
    "epochs": 3,
    "warmup_steps": 500,
    "weight_decay": 0.01,
    "optimizer": "adamw",
    "scheduler": "cosine"
}
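With gradient accumulation, each optimizer update sees more examples than batch_size alone suggests. A quick sanity check of the settings above (plain arithmetic, assuming a single device for this configuration):

# Effective batch size per optimizer step
effective_batch = config["batch_size"] * config["gradient_accumulation_steps"]
print(effective_batch)  # 8 * 4 = 32 examples per weight update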
# Example training data format
{
    "instruction": "Summarize the following text:",
    "input": "Large language models have shown remarkable capabilities...",
    "output": "LLMs demonstrate strong performance across many NLP tasks."
}
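Training data is supplied as JSONL, one instruction/input/output object per line, matching the file uploaded in the next step. A minimal sketch for writing records in that format, assuming your examples are already collected in a Python list:

# Write examples to a JSONL file (one JSON object per line)
import json

records = [
    {
        "instruction": "Summarize the following text:",
        "input": "Large language models have shown remarkable capabilities...",
        "output": "LLMs demonstrate strong performance across many NLP tasks."
    },
    # ... append the rest of your examples here
]

with open("full_training_data.jsonl", "w", encoding="utf-8") as f:
    for record in records:
        f.write(json.dumps(record, ensure_ascii=False) + "\n")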
# Upload dataset
dataset = client.datasets.upload(
    file_path="full_training_data.jsonl",
    name="full-finetune-dataset",
    validation_split=0.1
)
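Assuming validation_split carves the held-out evaluation set out of the uploaded file (10% here), a quick pre-upload check of how the examples will be divided:

# Count examples and estimate the train/validation split
with open("full_training_data.jsonl", "r", encoding="utf-8") as f:
    n_examples = sum(1 for line in f if line.strip())

n_val = int(n_examples * 0.1)
print(f"{n_examples - n_val} training examples, {n_val} validation examples")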
# Start full fine-tuning job
job = client.fine_tune.create(
    model="mistral-7b",
    dataset=dataset.id,
    config={
        "method": "full",
        "learning_rate": 5e-6,
        "batch_size": 4,
        "epochs": 2,
        "gradient_checkpointing": True,
        "fp16": True,
        "deepspeed_stage": 2,
        "save_steps": 500,
        "logging_steps": 100,
        "evaluation_strategy": "steps",
        "eval_steps": 500
    }
)
print(f"Full fine-tuning job started: {job.id}")
# Distributed training configuration
distributed_config = {
    "method": "full",
    "distributed": {
        "strategy": "deepspeed",
        "stage": 3,  # ZeRO stage 3 for maximum memory efficiency
        "gradient_clipping": 1.0,
        "allgather_bucket_size": 2e8,
        "reduce_bucket_size": 2e8
    },
    "hardware": {
        "gpu_count": 8,
        "instance_type": "gpu-large",
        "gradient_accumulation_steps": 16
    }
}
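ZeRO stage 3 partitions parameters, gradients, and optimizer states across the data-parallel ranks, so the per-GPU share of model states shrinks roughly linearly with gpu_count. Reusing the same 16-bytes-per-parameter estimate as above (activations and communication buffers excluded; 13e9 is an approximate count for a 13B model):

# Approximate per-GPU model-state memory under ZeRO stage 3
params = 13e9                                           # approximate parameter count of a 13B model
total_gb = params * 16 / 1e9                            # ~208 GB of model states in total
per_gpu_gb = total_gb / distributed_config["hardware"]["gpu_count"]
print(f"~{per_gpu_gb:.0f} GB of model states per GPU")  # ~26 GB across 8 GPUs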
# Launch distributed training
job = client.fine_tune.create(
    model="llama-2-13b",
    dataset=dataset.id,
    config=distributed_config
)
# Monitor training progress
import time

while job.status in ["queued", "running"]:
    job = client.fine_tune.get(job.id)
    if job.metrics:
        print(f"Step: {job.metrics.step}")
        print(f"Training Loss: {job.metrics.train_loss:.4f}")
        print(f"Validation Loss: {job.metrics.eval_loss:.4f}")
        print(f"Learning Rate: {job.metrics.learning_rate:.2e}")
    time.sleep(60)

print(f"Training completed with status: {job.status}")
# Best practices configuration
best_practices_config = {
    "method": "full",
    "learning_rate": 2e-6,     # Conservative learning rate
    "weight_decay": 0.01,      # L2 regularization
    "dropout": 0.1,            # Dropout for regularization
    "gradient_clipping": 1.0,  # Prevent gradient explosion
    "early_stopping": {
        "patience": 3,
        "metric": "eval_loss",
        "min_delta": 0.001
    },
    "save_strategy": "epoch",
    "load_best_model_at_end": True
}
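The configuration above plugs into the same job-creation call used throughout this guide. A usage sketch, reusing the base model and dataset from the earlier examples:

# Launch a run with the conservative settings above
job = client.fine_tune.create(
    model="mistral-7b",
    dataset=dataset.id,
    config=best_practices_config
)
print(f"Job started with best-practices config: {job.id}")

With the usual semantics for these keys, training stops after three consecutive evaluations without at least a 0.001 improvement in eval_loss, and the checkpoint with the best validation loss is restored at the end.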