Examples¶
This section contains step-by-step tutorials showing how to use the fishy-business library programmatically.
Getting Started¶
This example shows the simplest way to run a training experiment, using the high-level run_unified_training interface.
1# -*- coding: utf-8 -*-
2"""
3Tutorial 01: Getting Started
4----------------------------
5This tutorial demonstrates the simplest way to run a training experiment
6using the high-level `run_unified_training` interface.
7"""
8
9from pathlib import Path
10from fishy._core.config import TrainingConfig
11from fishy.experiments.unified_trainer import run_unified_training
12from fishy.cli.main import display_final_summary
13
14# Set up the path to your data
15PROJECT_ROOT = Path(__file__).resolve().parent.parent
16
17
def main():
    """Build a minimal config, launch one training run and show its summary."""
    print("--- Tutorial 01: Getting Started ---")

    # Step 1: gather the handful of options this run needs, then turn them
    # into a TrainingConfig in one go.
    settings = {
        "model": "transformer",
        "dataset": "species",
        "epochs": 5,
        "batch_size": 32,
        "wandb_log": False,
    }
    config = TrainingConfig(**settings)

    print(f"Launching a {config.model} training on the {config.dataset} dataset...")

    # Steps 2 and 3: run the experiment and pass whatever it returns straight
    # to the summary printer.
    display_final_summary(run_unified_training(config))


if __name__ == "__main__":
    main()
DataModule and Processing¶
Learn how the DataModule handles data loading, filtering, and conversion into PyTorch-ready tensors.
1# -*- coding: utf-8 -*-
2"""
3Tutorial 02: DataModule and Data Processing
4-------------------------------------------
5This tutorial explains how the `DataModule` handles data loading,
6filtering, and conversion into PyTorch-ready tensors.
7"""
8
9from pathlib import Path
10from fishy.data.module import create_data_module
11
12# Path to the dataset
13PROJECT_ROOT = Path(__file__).resolve().parent.parent
14
15
def main():
    """Create a DataModule, set it up and print what it exposes."""
    print("--- Tutorial 02: DataModule and Data Processing ---")

    # Step 1: build the module. Any dataset defined in
    # fishy/configs/datasets.yaml can be named here.
    dataset_name = "species"
    data_module = create_data_module(dataset_name=dataset_name)

    print(f"Initializing DataModule for: {dataset_name}")

    # Step 2: setup() is the call that triggers the actual loading from
    # Excel/CSV and applies filters.
    data_module.setup()

    # Step 3: metadata the module derives from the data itself.
    input_dim = data_module.get_input_dim()
    print(f" Input Dimension (features): {input_dim}")
    num_classes = data_module.get_num_classes()
    print(f" Number of Classes: {num_classes}")
    class_names = data_module.get_class_names()
    print(f" Class Names: {class_names}")

    # Step 4: the full dataset as NumPy arrays, handy for inspection or
    # traditional ML.
    features, targets = data_module.get_numpy_data(labels_as_indices=True)
    print(f"\nNumPy Data Shape: X={features.shape}, y={targets.shape}")

    # Step 5: the PyTorch DataLoader used by the deep learning training loop;
    # pull a single batch from it to look at the tensors.
    train_loader = data_module.get_train_dataloader()
    spectra, labels = next(iter(train_loader))

    print("\nFirst PyTorch Batch:")
    print(f" Spectra tensor shape: {spectra.shape}")
    print(f" Labels tensor shape: {labels.shape}")


if __name__ == "__main__":
    main()
Configuration Management¶
Using TrainingConfig and ExperimentConfig to centralize hyperparameters and experimental settings.
1# -*- coding: utf-8 -*-
2"""
3Tutorial 03: Configuration Management
4-------------------------------------
5This tutorial covers the `TrainingConfig` and `ExperimentConfig` classes,
6which centralize all hyperparameters and experimental settings.
7"""
8
9from fishy._core.config import TrainingConfig, ExperimentConfig
10from pathlib import Path
11
12
def main():
    """Demonstrate TrainingConfig and ExperimentConfig.

    Builds a single-run config, round-trips it through a YAML file in the
    current working directory, then builds a batch config. The YAML file is
    scratch output and is always removed again, even if saving or loading
    raises.
    """
    print("--- Tutorial 03: Configuration Management ---")

    # 1. Single Run Configuration (TrainingConfig)
    # This class holds everything needed for one training session.
    train_cfg = TrainingConfig(
        model="cnn",
        dataset="part",
        epochs=10,
        learning_rate=5e-4,
        batch_size=16,
        data_augmentation=True,  # Enable built-in augmentation
    )

    print("\nTrainingConfig created:")
    print(f" Model: {train_cfg.model}")
    print(f" Augmentation: {train_cfg.data_augmentation}")

    # 2. Saving and Loading YAML
    # Configs can be serialized to disk for reproducibility or CLI use.
    yaml_path = "example_config.yaml"
    try:
        train_cfg.to_yaml(yaml_path)
        print(f" Config saved to {yaml_path}")

        # Loading it back
        loaded_cfg = TrainingConfig.from_yaml(yaml_path)
        print(f" Loaded Model: {loaded_cfg.model}")
    finally:
        # Clean up in `finally` so a failed save/load does not leave the
        # scratch file behind. The file may not exist if to_yaml() itself
        # failed, hence the tolerated FileNotFoundError.
        try:
            Path(yaml_path).unlink()
        except FileNotFoundError:
            pass

    # 3. Batch Configuration (ExperimentConfig)
    # Used to orchestrate multiple models across multiple datasets.
    exp_cfg = ExperimentConfig(
        name="my_first_batch",
        num_runs=5,  # Run each combination 5 times for statistics
        datasets=["species", "oil"],
        models=["cnn", "transformer", "opls-da"],
        benchmark=True,  # Enable performance measuring for all
        overrides={"epochs": 2},  # Force these settings on all runs
    )

    print("\nExperimentConfig created:")
    print(f" Batch Name: {exp_cfg.name}")
    print(f" Total combinations: {len(exp_cfg.datasets) * len(exp_cfg.models)}")


if __name__ == "__main__":
    main()
Training Engines¶
Exploring different ways to train models, from automated orchestration to direct control over the training loop.
1# -*- coding: utf-8 -*-
2"""
3Tutorial 04: Training Engines (High vs. Low Level)
4--------------------------------------------------
5This tutorial explores different ways to train models, from automated
6orchestration to direct control over the training loop.
7"""
8
9import torch
10import torch.nn as nn
11from torch.utils.data import DataLoader, TensorDataset
12from fishy.engine.trainer import Trainer
13from fishy._core.config import TrainingConfig
14from fishy.experiments.unified_trainer import run_unified_training
15from pathlib import Path
16
17PROJECT_ROOT = Path(__file__).resolve().parent.parent
18
19
def main():
    """Contrast the orchestrated entry point with driving Trainer by hand."""
    print("--- Tutorial 04: Training Engines ---")

    # --- LEVEL 1: Unified Orchestration ---
    # Suited to standard experiments and benchmarking.
    print("\n--- Level 1: run_unified_training ---")
    level1_settings = {
        "model": "transformer",
        "dataset": "species",
        "epochs": 2,
        "batch_size": 32,
        "wandb_log": False,
    }
    config = TrainingConfig(**level1_settings)
    results = run_unified_training(config)
    orchestrated_acc = results.get("val_balanced_accuracy", 0)
    print(f"Orchestrated Accuracy: {orchestrated_acc:.4f}")

    # --- LEVEL 2: Direct Trainer usage ---
    # Suited to bringing your own model/data while reusing the library's loop.
    print("\n--- Level 2: Custom Trainer Control ---")

    # 1. A small PyTorch model. It is created before the data so the order of
    #    random draws matches the original script.
    model = nn.Sequential(
        nn.Linear(100, 64),
        nn.ReLU(),
        nn.Linear(64, 2),
    )

    # 2. Synthetic data: random inputs with random binary labels.
    inputs = torch.randn(32, 100)
    targets = torch.randint(0, 2, (32,))
    dataset = TensorDataset(inputs, targets)
    loader = DataLoader(dataset, batch_size=8)

    # 3. Assemble the pieces the Trainer needs, then construct it directly.
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    cpu = torch.device("cpu")
    trainer = Trainer(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        device=cpu,
        num_epochs=2,
    )

    # 4. Train. The same loader doubles as the validation loader here.
    train_res = trainer.train(loader, val_loader=loader)
    manual_acc = train_res["best_accuracy"]
    print(f"Manual Trainer Accuracy: {manual_acc:.4f}")


if __name__ == "__main__":
    main()
Automated Benchmarking¶
How to trigger the full automated benchmark suite used in the paper, covering all datasets and model categories.
1# -*- coding: utf-8 -*-
2"""
3Tutorial 05: Automated Benchmarking (Run All)
4---------------------------------------------
5This tutorial shows how to trigger the full automated benchmark suite
6used in the paper, covering all datasets and model categories.
7"""
8
9from fishy.experiments.unified_trainer import run_all_benchmarks
10
11
def main():
    """Launch the quick variant of the benchmark suite and print its summary."""
    print("--- Tutorial 05: Automated Benchmarking ---")

    # `run_all_benchmarks` is the "one-button" entry point for comparing every
    # Classic, Deep, and Evolutionary model against each other, with repeated
    # cross-validation and statistical significance tests.
    #
    # `quick=True` keeps this example small: a subset of 2 models, 1 dataset
    # and 2 runs rather than the full 30-run suite.
    print("Launching quick benchmark suite...")

    benchmark_options = {"quick": True, "wandb_log": False}
    summary_df = run_all_benchmarks(**benchmark_options)

    print("\nBenchmark Summary Table:")
    rendered_table = summary_df.to_string()
    print(rendered_table)


if __name__ == "__main__":
    main()
Self-Supervised Pre-training¶
Demonstrates how to use unlabeled or semi-labeled data to pre-train a model using various self-supervised tasks.
1# -*- coding: utf-8 -*-
2"""
3Tutorial 06: Self-Supervised Pre-training
4-----------------------------------------
5This tutorial demonstrates how to use unlabeled or semi-labeled data
6to pre-train a model using various self-supervised tasks.
7"""
8
9from pathlib import Path
10from fishy._core.config import TrainingConfig
11from fishy.experiments.deep_training import ModelTrainer
12
13PROJECT_ROOT = Path(__file__).resolve().parent.parent
14
15
def main():
    """Configure two self-supervised tasks and run the pre-training phase."""
    print("--- Tutorial 06: Self-Supervised Pre-training ---")

    # Step 1: switch on the pre-training tasks through the config. Here that
    # is Masked Spectra Modelling (MSM) and Denoising (SDA).
    pretraining_settings = {
        "model": "transformer",
        "dataset": "species",
        "epochs": 5,
        "masked_spectra_modelling": True,
        "spectrum_denoising_autoencoding": True,
        "wandb_log": False,
    }
    config = TrainingConfig(**pretraining_settings)

    # Step 2: the high-level ModelTrainer is driven entirely by that config.
    trainer = ModelTrainer(config)

    # Step 3: run pre-training. Each enabled task is executed in sequence,
    # chaining the weights from one to the next.
    print(f"Starting pre-training tasks for {config.model}...")
    pre_trained_model = trainer.pre_train()

    # Nothing more to report when pre-training yields a falsy result.
    if not pre_trained_model:
        return

    print("\nPre-training successful! Model is now ready for fine-tuning.")

    # Step 4 (optional): continue to fine-tuning with the learned weights.
    # results = trainer.train(pre_trained_model)
    # print(f"Fine-tuned Accuracy: {results.get('balanced_accuracy', 0):.4f}")


if __name__ == "__main__":
    main()
Sequential Transfer Learning¶
How to transfer knowledge from one dataset to another sequentially, using different classes/tasks at each stage.
1# -*- coding: utf-8 -*-
2"""
3Tutorial 07: Sequential Transfer Learning
4-----------------------------------------
5This tutorial shows how to transfer knowledge from one dataset to another
6sequentially, using different classes/tasks at each stage.
7"""
8
9from pathlib import Path
10from fishy.experiments.transfer import run_sequential_transfer_learning
11
12PROJECT_ROOT = Path(__file__).resolve().parent.parent
13
14
def main():
    """Train on 'part', transfer to 'species', and report the last metric."""
    print("--- Tutorial 07: Sequential Transfer Learning ---")

    # Goal: learn on 'part' (e.g. skin vs fillet) first, then carry that
    # knowledge over to 'species' (e.g. hoki vs mackerel).
    #
    # run_sequential_transfer_learning automates the whole pipeline:
    #   1. load the source dataset
    #   2. train the model
    #   3. swap the classification head
    #   4. load the target dataset
    #   5. fine-tune the final model

    print("Starting Transfer Learning: [part] -> [species]")

    transfer_options = {
        "model_name": "transformer",
        "transfer_datasets": ["part"],
        "target_dataset": "species",
        "num_epochs_transfer": 5,  # kept short for the example
        "num_epochs_finetune": 5,
        "batch_size": 32,
        "wandb_log": False,
    }
    # The trained model is returned too, but only the history is used here.
    _model, history = run_sequential_transfer_learning(**transfer_options)

    print("\nTransfer Learning complete.")

    # Last recorded fine-tuning value of 'val_balanced_acc' for 'species'.
    final_acc = history["finetune"]["species"]["val_balanced_acc"][-1]
    print(f"Final accuracy on target dataset: {final_acc:.2f}%")


if __name__ == "__main__":
    main()
Probabilistic Inference¶
Using Bayesian models like Gaussian Processes to get predictions along with uncertainty estimates.
1# -*- coding: utf-8 -*-
2"""
3Tutorial 08: Probabilistic Inference and Uncertainty
4----------------------------------------------------
5This tutorial demonstrates how to use Bayesian models like Gaussian Processes
6to get not just predictions, but also uncertainty estimates.
7"""
8
9from fishy._core.config import TrainingConfig
10from fishy.experiments.classic_training import run_sklearn_experiment
11from fishy.data.module import create_data_module
12from sklearn.model_selection import train_test_split
13from pathlib import Path
14import numpy as np
15
16PROJECT_ROOT = Path(__file__).resolve().parent.parent
17
18
def main():
    """Run a GP experiment, then inspect per-sample uncertainty by hand."""
    print("--- Tutorial 08: Probabilistic Inference ---")

    # Step 1: configure a probabilistic model. Gaussian Process ("gp") is
    # registered in `models.yaml`.
    gp_settings = {
        "model": "gp",
        "dataset": "species",
        "k_folds": 2,  # small fold count keeps the example fast
    }
    config = TrainingConfig(**gp_settings)

    # Step 2: run the experiment through the classic-training entry point.
    print(f"Training {config.model} with uncertainty estimation...")
    stats = run_sklearn_experiment(config, "gp", "species")

    mean_accuracy = stats["val_balanced_accuracy"]
    print(f"\nGP Mean Accuracy: {mean_accuracy:.4f}")

    # Step 3: manual uncertainty inspection, working with the model directly.
    data_module = create_data_module("species")
    data_module.setup()
    features, targets = data_module.get_numpy_data(labels_as_indices=True)

    # A stratified, shuffled split so the small training subset contains both
    # classes; the held-out labels are not needed below.
    X_train, X_test, y_train, _ = train_test_split(
        features,
        targets,
        train_size=50,
        stratify=targets,
        random_state=42,
    )

    # The GP class (Scikit-learn wrapper) is imported at the point of use.
    from fishy.models.probabilistic.gp import GP

    gp_model = GP()

    # Fit on the shuffled subset.
    gp_model.fit(X_train, y_train)

    # Query uncertainty (1.0 - max_prob) and predictions for the same five
    # held-out samples.
    probe = X_test[:5]
    uncertainty = gp_model.get_uncertainty(probe)
    preds = gp_model.predict(probe)

    print("\nPredictions with Uncertainty (Test Subset):")
    for index, (label, spread) in enumerate(zip(preds, uncertainty)):
        print(f" Sample {index}: Pred={label}, Uncertainty={spread:.4f}")


if __name__ == "__main__":
    main()
Outputs and Visualization¶
Where to find experiment results and how to interpret generated artifacts like logs, metrics, and figures.
1# -*- coding: utf-8 -*-
2"""
3Tutorial 09: Outputs and Visualization
4--------------------------------------
5This tutorial explains where to find the results of your experiments
6and how to interpret the generated artifacts.
7"""
8
9from fishy._core.utils import RunContext
10from pathlib import Path
11
12
def main():
    """Create a RunContext and print the directories it exposes."""
    print("--- Tutorial 09: Outputs and Visualization ---")

    # A `RunContext` is created every time an experiment runs. It sets up a
    # structured output directory automatically:
    #   outputs/{dataset}/{method}/{model}_{timestamp}/
    ctx = RunContext(dataset="species", method="deep", model_name="transformer")

    print(f"\nRun directory created at: {ctx.run_dir}")

    # Each entry pairs the printed label with the RunContext attribute that
    # holds the matching directory.
    artifact_dirs = (
        # Logs: {run_dir}/logs/experiment.log
        ("Logs", "log_dir"),
        # Metrics: {run_dir}/results/metrics.json, saved using the custom
        # NumpyEncoder.
        ("Results", "result_dir"),
        # Figures: {run_dir}/figures/ -- training curves appear here when
        # `figures=True` is set in the config.
        ("Figures", "figure_dir"),
        # Checkpoints: {run_dir}/checkpoints/ -- best model weights are saved
        # here during training.
        ("Checkpoints", "checkpoint_dir"),
    )
    for label, attribute in artifact_dirs:
        print(f" {label}: {getattr(ctx, attribute)}")

    print("\nTo generate these automatically during a run, ensure your config has:")
    for flag in ("benchmark", "figures"):
        print(f" config.{flag} = True")


if __name__ == "__main__":
    main()