Understand model evaluation metrics and best practices for measuring performance.

from langtrain import Evaluator

# Initialize evaluator
evaluator = Evaluator(task_type='text_classification')

# Built-in metrics
results = evaluator.evaluate(
    model=model,
    test_data=test_dataset,
    metrics=['accuracy', 'f1_score', 'precision', 'recall']
)
print(f"Accuracy: {results['accuracy']:.4f}")
print(f"F1-Score: {results['f1_score']:.4f}")
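
These built-in metrics are the standard classification scores. For reference, the same numbers can be computed directly with scikit-learn; this is only an illustration, and the prediction/label arrays (y_true, y_pred) are placeholder names, not part of the langtrain API:

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# y_true and y_pred are lists/arrays of gold labels and model predictions
accuracy = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred, average='macro')            # macro-average across classes
precision = precision_score(y_true, y_pred, average='macro')
recall = recall_score(y_true, y_pred, average='macro')
print(f"Accuracy: {accuracy:.4f}, F1: {f1:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}")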

# For text generation tasks
gen_evaluator = Evaluator(task_type='text_generation')
gen_results = gen_evaluator.evaluate(
    model=model,
    test_data=test_dataset,
    metrics=['bleu', 'rouge', 'bert_score']
)
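
Generation metrics score overlap between generated text and reference text rather than exact label matches. As a rough, purely illustrative sketch (not the library's implementation), sentence-level BLEU can be computed with NLTK, and a unigram-recall proxy stands in for ROUGE-1; hypothesis and reference are placeholder strings:

from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

hyp_tokens = hypothesis.split()
ref_tokens = reference.split()

# Sentence-level BLEU; the first argument is a list of reference token lists
bleu = sentence_bleu([ref_tokens], hyp_tokens,
                     smoothing_function=SmoothingFunction().method1)

# Unigram recall as a crude ROUGE-1-style score
overlap = len(set(hyp_tokens) & set(ref_tokens))
rouge1_recall = overlap / len(set(ref_tokens)) if ref_tokens else 0.0
print(f"BLEU: {bleu:.4f}, approx. ROUGE-1 recall: {rouge1_recall:.4f}")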

# Define custom evaluation metric
def custom_domain_accuracy(predictions, labels, domain_weights):
    """Custom metric that weights accuracy by domain importance."""
    correct = 0
    total_weight = 0
    for pred, label, weight in zip(predictions, labels, domain_weights):
        if pred == label:
            correct += weight
        total_weight += weight
    return correct / total_weight if total_weight > 0 else 0

# Register custom metric
evaluator.register_metric('domain_accuracy', custom_domain_accuracy)

# Use in evaluation
results = evaluator.evaluate(
    model=model,
    test_data=test_dataset,
    metrics=['accuracy', 'domain_accuracy'],
    metric_params={'domain_accuracy': {'domain_weights': weights}}
)
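
To make the weighting concrete, here is the same function called standalone on toy values (the numbers are made up purely for illustration):

preds = ['pos', 'neg', 'pos', 'neg']
labels = ['pos', 'neg', 'neg', 'neg']
weights = [2.0, 1.0, 2.0, 1.0]  # first and third examples come from a higher-priority domain

# Correct predictions contribute their weight: (2.0 + 1.0 + 1.0) / 6.0 ≈ 0.667
print(custom_domain_accuracy(preds, labels, weights))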

# Cross-validation evaluation
from langtrain.evaluation import CrossValidator

cv = CrossValidator(
    folds=5,
    stratified=True,
    random_state=42
)
cv_results = cv.evaluate(
    model=model,
    data=dataset,
    metrics=['accuracy', 'f1_score']
)
print(f"CV Accuracy: {cv_results['accuracy'].mean():.4f} ± {cv_results['accuracy'].std():.4f}")
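
Stratified k-fold cross-validation keeps the class distribution roughly constant across folds. For reference, a hand-rolled equivalent with scikit-learn looks like this; it assumes a scikit-learn-style model with fit/predict and array-like X, y, none of which come from langtrain:

import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
fold_scores = []
for train_idx, test_idx in skf.split(X, y):
    model.fit(X[train_idx], y[train_idx])      # retrain on this fold's training split
    preds = model.predict(X[test_idx])         # predict on the held-out fold
    fold_scores.append(accuracy_score(y[test_idx], preds))

print(f"CV Accuracy: {np.mean(fold_scores):.4f} ± {np.std(fold_scores):.4f}")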

# Temporal split for time-series data
from langtrain.evaluation import TemporalSplit

temporal_split = TemporalSplit(
    train_size=0.7,
    val_size=0.15,
    test_size=0.15,
    time_column='timestamp'
)
train, val, test = temporal_split.split(dataset)

# Evaluate on temporal test set
temporal_results = evaluator.evaluate(
    model=model,
    test_data=test,
    metrics=['accuracy', 'f1_score']
)
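
The point of a temporal split is that training examples must precede validation and test examples in time, rather than being sampled at random. A minimal hand-rolled version with pandas, assuming df is a DataFrame with a 'timestamp' column (not part of the langtrain API):

import pandas as pd

df = df.sort_values('timestamp').reset_index(drop=True)  # order records chronologically
n = len(df)
train_df = df.iloc[:int(0.70 * n)]                # oldest 70% for training
val_df = df.iloc[int(0.70 * n):int(0.85 * n)]     # next 15% for validation
test_df = df.iloc[int(0.85 * n):]                 # most recent 15% for testing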

# Compare multiple models
from langtrain.evaluation import ModelComparator

comparator = ModelComparator(
    models=[model1, model2, model3],
    model_names=['BERT', 'RoBERTa', 'DistilBERT']
)
comparison_results = comparator.compare(
    test_data=test_dataset,
    metrics=['accuracy', 'f1_score', 'inference_time'],
    statistical_test='mcnemar'  # McNemar's test for significance
)

# Generate comparison report
comparator.generate_report(
    results=comparison_results,
    output_path='model_comparison_report.html',
    include_plots=True
)

print(comparison_results.summary())
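
McNemar's test compares two classifiers evaluated on the same test set by looking at the examples on which they disagree. A hedged sketch of the underlying computation using statsmodels (preds_a, preds_b, and labels are placeholder arrays, not langtrain objects):

import numpy as np
from statsmodels.stats.contingency_tables import mcnemar

a_correct = np.array(preds_a) == np.array(labels)
b_correct = np.array(preds_b) == np.array(labels)

# 2x2 contingency table of (model A correct, model B correct) counts
table = [
    [np.sum(a_correct & b_correct), np.sum(a_correct & ~b_correct)],
    [np.sum(~a_correct & b_correct), np.sum(~a_correct & ~b_correct)],
]
result = mcnemar(table, exact=True)  # exact binomial test on the off-diagonal counts
print(f"McNemar statistic: {result.statistic}, p-value: {result.pvalue:.4f}")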

# Continuous evaluation setup
from langtrain.evaluation import ContinuousEvaluator

continuous_eval = ContinuousEvaluator(
    model=model,
    evaluation_schedule='daily',
    alert_thresholds={
        'accuracy': 0.85,  # Alert if accuracy drops below 85%
        'f1_score': 0.80
    }
)

# Monitor data drift
continuous_eval.enable_drift_detection(
    reference_data=training_data,
    drift_threshold=0.1
)

# Set up alerts
continuous_eval.configure_alerts(
    email=['team@company.com'],
    slack_webhook='https://hooks.slack.com/...'
)

# Start monitoring
continuous_eval.start()
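
Under the hood, drift detection generally compares the distribution of incoming data against the reference (training) distribution. As a simple illustration, a two-sample Kolmogorov-Smirnov test from SciPy can flag drift on a single numeric feature; reference_feature and production_feature are placeholder arrays, not langtrain objects:

from scipy.stats import ks_2samp

# Compare one numeric feature between the reference data and recent production data
stat, p_value = ks_2samp(reference_feature, production_feature)
if p_value < 0.05:
    print(f"Possible drift detected (KS statistic={stat:.3f}, p={p_value:.4f})")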