Learn how to load, preprocess, validate, and version datasets in LangTrain to prepare data for model training.
```python
# Load dataset from various sources
from langtrain import Dataset

# From CSV
dataset = Dataset.from_csv(
    'data.csv',
    text_column='text',
    label_column='label',
)

# From JSON Lines
dataset = Dataset.from_json('data.jsonl')

# From the HuggingFace Hub
dataset = Dataset.from_huggingface('imdb')

# From a custom source, with a user-defined preprocessor
# (see the sketch below for what such a callable might look like)
dataset = Dataset.from_custom(
    path='custom_data/',
    preprocessor=custom_preprocessor,
)
```
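`Dataset.from_custom` expects a user-supplied `preprocessor` callable, which the snippet above references but never defines. A minimal sketch of one, assuming it receives a raw record as a dict and returns the cleaned record (the exact signature LangTrain expects may differ):

```python
def custom_preprocessor(record):
    # Assumed contract: one raw record in, one cleaned record out.
    # Normalize whitespace and lowercase the text field.
    record['text'] = ' '.join(record['text'].split()).lower()
    return record
```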
```python
# Data preprocessing pipeline
dataset = dataset.preprocess([
    # Text cleaning
    dataset.clean_text(remove_urls=True, remove_special=True),
    # Tokenization
    dataset.tokenize(tokenizer='bert-base-uncased', max_length=512),
    # Data augmentation
    dataset.augment(techniques=['synonym_replacement', 'back_translation']),
    # Train/validation split
    dataset.split(train_size=0.8, stratify=True),
])
```
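`stratify=True` keeps the label distribution identical across the train and validation splits. To see what that means concretely outside LangTrain, scikit-learn's `train_test_split` implements the same idea; a self-contained sketch with toy data:

```python
from sklearn.model_selection import train_test_split

texts = [f'example {i}' for i in range(10)]
labels = [i % 2 for i in range(10)]  # balanced binary labels

# stratify=labels preserves the 50/50 label ratio in both splits,
# mirroring split(train_size=0.8, stratify=True) above
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels, train_size=0.8, stratify=labels, random_state=42
)
```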
```python
# Custom preprocessing function
def custom_preprocess(batch):
    # With batched=True below, batch is a dict of lists, so each field
    # is processed as a whole column rather than one example at a time
    batch['text'] = [text.lower().strip() for text in batch['text']]
    return batch

dataset = dataset.map(custom_preprocess, batched=True)
```
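Since `custom_preprocess` is plain Python, you can sanity-check it on an in-memory batch before handing it to `map`:

```python
batch = {'text': ['  Hello World  ', 'GOODBYE '], 'label': [1, 0]}
print(custom_preprocess(batch)['text'])  # ['hello world', 'goodbye']
```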
```python
# Data quality analysis
quality_report = dataset.analyze_quality()
print(quality_report.summary())
```
# Validation checks
dataset.validate([
'check_missing_values',
'check_label_distribution',
'check_text_length',
'check_duplicates'
])
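The named checks correspond to simple invariants over the records. As a rough illustration of what `check_duplicates` and `check_text_length` verify, here is the equivalent logic in plain Python (the actual LangTrain internals may differ):

```python
from collections import Counter

records = [
    {'text': 'good film, well acted', 'label': 1},
    {'text': 'good film, well acted', 'label': 1},  # exact duplicate
    {'text': 'meh', 'label': 0},                    # suspiciously short
]

# check_duplicates: texts that appear more than once
dupes = [t for t, n in Counter(r['text'] for r in records).items() if n > 1]

# check_text_length: texts outside a plausible length range
too_short = [r['text'] for r in records if len(r['text']) < 10]

print(dupes)      # ['good film, well acted']
print(too_short)  # ['meh']
```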
```python
# Automatic data cleaning
dataset = dataset.clean(
    remove_duplicates=True,
    handle_missing='drop',
    min_text_length=10,
    max_text_length=1000,
)
```
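For comparison, the same cleaning policy expressed directly with pandas (assuming the CSV from the loading example, with `text` and `label` columns):

```python
import pandas as pd

df = pd.read_csv('data.csv')
df = df.drop_duplicates(subset='text')        # remove_duplicates=True
df = df.dropna(subset=['text', 'label'])      # handle_missing='drop'
lengths = df['text'].str.len()
df = df[(lengths >= 10) & (lengths <= 1000)]  # min/max_text_length bounds
```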
```python
# Version your datasets
dataset.save_version('v1.0', description='Initial dataset')

# Load a specific version
dataset = Dataset.load_version('my_dataset', version='v1.0')

# Compare versions
comparison = Dataset.compare_versions('my_dataset', 'v1.0', 'v1.1')
print(comparison.statistics())
```
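Putting these calls together, a typical workflow is to snapshot the dataset before and after cleaning, then diff the two versions. A sketch using only the methods shown above (how a saved version binds to the name `'my_dataset'` is an assumption here):

```python
dataset.save_version('v1.0', description='Raw import')
dataset = dataset.clean(remove_duplicates=True, handle_missing='drop')
dataset.save_version('v1.1', description='After automatic cleaning')

comparison = Dataset.compare_versions('my_dataset', 'v1.0', 'v1.1')
print(comparison.statistics())
```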