Examples

This section contains a step-by-step tutorial on how to use the fishy-business library programmatically.

Getting Started

This tutorial shows the simplest way to run a training experiment, using the high-level `run_unified_training` interface.

# -*- coding: utf-8 -*-
"""
Tutorial 01: Getting Started
----------------------------
This tutorial demonstrates the simplest way to run a training experiment
using the high-level `run_unified_training` interface.
"""

from pathlib import Path
from fishy._core.config import TrainingConfig
from fishy.experiments.unified_trainer import run_unified_training
from fishy.cli.main import display_final_summary

# Set up the path to your data
PROJECT_ROOT = Path(__file__).resolve().parent.parent


def main():
    """Run one short training experiment and print its summary.

    Builds a minimal ``TrainingConfig`` (transformer on the ``species``
    dataset, 5 epochs, batch size 32, W&B logging off), passes it to
    ``run_unified_training`` and hands the returned results to
    ``display_final_summary``.
    """
    print("--- Tutorial 01: Getting Started ---")

    # Step 1: the smallest configuration needed for a run.
    settings = {
        "model": "transformer",
        "dataset": "species",
        "epochs": 5,
        "batch_size": 32,
        "wandb_log": False,
    }
    config = TrainingConfig(**settings)

    print(f"Launching a {config.model} training on the {config.dataset} dataset...")

    # Steps 2 and 3: run the experiment, then render the results as the
    # final summary table.
    display_final_summary(run_unified_training(config))


if __name__ == "__main__":
    main()

DataModule and Processing

Learn how the DataModule handles data loading, filtering, and conversion into PyTorch-ready tensors.

# -*- coding: utf-8 -*-
"""
Tutorial 02: DataModule and Data Processing
-------------------------------------------
This tutorial explains how the `DataModule` handles data loading,
filtering, and conversion into PyTorch-ready tensors.
"""

from pathlib import Path
from fishy.data.module import create_data_module

# Path to the dataset
PROJECT_ROOT = Path(__file__).resolve().parent.parent


def main():
    """Create a ``DataModule``, set it up, and inspect what it exposes.

    Prints the metadata reported by the module, the shapes of the NumPy
    arrays it returns, and the shapes of the first batch produced by its
    training dataloader.
    """
    print("--- Tutorial 02: DataModule and Data Processing ---")

    # Step 1: build the module. Other dataset names are defined in
    # fishy/configs/datasets.yaml.
    dataset_name = "species"
    dm = create_data_module(dataset_name=dataset_name)

    print(f"Initializing DataModule for: {dataset_name}")

    # Step 2: setup() is where the data is actually loaded from Excel/CSV
    # and the filters are applied.
    dm.setup()

    # Step 3: metadata -- input dimension and classes are derived from the
    # data by the module itself.
    print(f"  Input Dimension (features): {dm.get_input_dim()}")
    print(f"  Number of Classes:          {dm.get_num_classes()}")
    print(f"  Class Names:                {dm.get_class_names()}")

    # Step 4: the full dataset as NumPy arrays, handy for inspection or
    # traditional ML.
    X, y = dm.get_numpy_data(labels_as_indices=True)
    print(f"\nNumPy Data Shape: X={X.shape}, y={y.shape}")

    # Step 5: pull a single batch from the PyTorch DataLoader that the deep
    # learning training loop consumes.
    spectra, labels = next(iter(dm.get_train_dataloader()))

    print("\nFirst PyTorch Batch:")
    print(f"  Spectra tensor shape: {spectra.shape}")
    print(f"  Labels tensor shape:  {labels.shape}")


if __name__ == "__main__":
    main()

Configuration Management

Using TrainingConfig and ExperimentConfig to centralize hyperparameters and experimental settings.

# -*- coding: utf-8 -*-
"""
Tutorial 03: Configuration Management
-------------------------------------
This tutorial covers the `TrainingConfig` and `ExperimentConfig` classes,
which centralize all hyperparameters and experimental settings.
"""

from fishy._core.config import TrainingConfig, ExperimentConfig
from pathlib import Path


def main():
    """Demonstrate ``TrainingConfig`` and ``ExperimentConfig``.

    Builds a single-run ``TrainingConfig``, round-trips it through a YAML
    file written to the current working directory, and then builds an
    ``ExperimentConfig`` describing a batch of runs.

    The scratch YAML file is removed in a ``finally`` clause, so it does not
    linger in the working directory when saving or loading raises.
    """
    print("--- Tutorial 03: Configuration Management ---")

    # 1. Single Run Configuration (TrainingConfig)
    # This class holds everything needed for one training session.
    train_cfg = TrainingConfig(
        model="cnn",
        dataset="part",
        epochs=10,
        learning_rate=5e-4,
        batch_size=16,
        data_augmentation=True,  # Enable built-in augmentation
    )

    print("\nTrainingConfig created:")
    print(f"  Model: {train_cfg.model}")
    print(f"  Augmentation: {train_cfg.data_augmentation}")

    # 2. Saving and Loading YAML
    # Configs can be serialized to disk for reproducibility or CLI use.
    yaml_path = "example_config.yaml"
    try:
        train_cfg.to_yaml(yaml_path)
        print(f"  Config saved to {yaml_path}")

        # Loading it back
        loaded_cfg = TrainingConfig.from_yaml(yaml_path)
        print(f"  Loaded Model: {loaded_cfg.model}")
    finally:
        # Clean up the scratch file on both the success and failure paths;
        # it is only needed for the round-trip above.
        if Path(yaml_path).exists():
            Path(yaml_path).unlink()

    # 3. Batch Configuration (ExperimentConfig)
    # Used to orchestrate multiple models across multiple datasets.
    exp_cfg = ExperimentConfig(
        name="my_first_batch",
        num_runs=5,  # Run each combination 5 times for statistics
        datasets=["species", "oil"],
        models=["cnn", "transformer", "opls-da"],
        benchmark=True,  # Enable performance measuring for all
        overrides={"epochs": 2},  # Force these settings on all runs
    )

    print("\nExperimentConfig created:")
    print(f"  Batch Name: {exp_cfg.name}")
    print(f"  Total combinations: {len(exp_cfg.datasets) * len(exp_cfg.models)}")


if __name__ == "__main__":
    main()

Training Engines

Exploring different ways to train models, from automated orchestration to direct control over the training loop.

# -*- coding: utf-8 -*-
"""
Tutorial 04: Training Engines (High vs. Low Level)
--------------------------------------------------
This tutorial explores different ways to train models, from automated
orchestration to direct control over the training loop.
"""

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from fishy.engine.trainer import Trainer
from fishy._core.config import TrainingConfig
from fishy.experiments.unified_trainer import run_unified_training
from pathlib import Path

PROJECT_ROOT = Path(__file__).resolve().parent.parent


def main():
    """Train a model two ways: via the orchestrator and via ``Trainer``.

    Level 1 passes a ``TrainingConfig`` to ``run_unified_training``.
    Level 2 builds a small ``nn.Sequential`` network and random tensors,
    then drives ``Trainer`` directly, using the same loader for training
    and validation.
    """
    print("--- Tutorial 04: Training Engines ---")

    # --- LEVEL 1: Unified Orchestration ---
    # Suited to standard experiments and benchmarking.
    print("\n--- Level 1: run_unified_training ---")
    results = run_unified_training(
        TrainingConfig(
            model="transformer",
            dataset="species",
            epochs=2,
            batch_size=32,
            wandb_log=False,
        )
    )
    print(f"Orchestrated Accuracy: {results.get('val_balanced_accuracy', 0):.4f}")

    # --- LEVEL 2: Direct Trainer usage ---
    # For when the model and data are your own but you still want the
    # library's training loop.
    print("\n--- Level 2: Custom Trainer Control ---")

    # A small feed-forward network: 100 inputs -> 64 hidden -> 2 outputs.
    net = nn.Sequential(nn.Linear(100, 64), nn.ReLU(), nn.Linear(64, 2))

    # Synthetic data: 32 random samples with binary labels, batched by 8.
    features = torch.randn(32, 100)
    targets = torch.randint(0, 2, (32,))
    toy_loader = DataLoader(TensorDataset(features, targets), batch_size=8)

    # Wire up the Trainer by hand.
    loss_fn = nn.CrossEntropyLoss()
    adam = torch.optim.Adam(net.parameters(), lr=1e-3)
    cpu = torch.device("cpu")
    engine = Trainer(
        model=net,
        criterion=loss_fn,
        optimizer=adam,
        device=cpu,
        num_epochs=2,
    )

    # Run the loop; the result gives raw access to epoch logs and best
    # model states.
    train_res = engine.train(toy_loader, val_loader=toy_loader)
    print(f"Manual Trainer Accuracy: {train_res['best_accuracy']:.4f}")


if __name__ == "__main__":
    main()

Automated Benchmarking

Learn how to trigger the full automated benchmark suite used in the paper, covering all datasets and model categories.

# -*- coding: utf-8 -*-
"""
Tutorial 05: Automated Benchmarking (Run All)
---------------------------------------------
This tutorial shows how to trigger the full automated benchmark suite
used in the paper, covering all datasets and model categories.
"""

from fishy.experiments.unified_trainer import run_all_benchmarks


def main():
    """Run the quick variant of the benchmark suite and print its summary.

    Calls ``run_all_benchmarks`` with ``quick=True`` and W&B logging
    disabled, then prints the returned table via ``to_string()``.
    """
    print("--- Tutorial 05: Automated Benchmarking ---")

    # `run_all_benchmarks` is the "one-button" entry point that compares
    # every Classic, Deep, and Evolutionary model against each other, using
    # repeated cross-validation and statistical significance tests.
    #
    # `quick=True` restricts this example to a very small subset
    # (2 models, 1 dataset, 2 runs) rather than the full 30-run suite.
    print("Launching quick benchmark suite...")

    options = {"quick": True, "wandb_log": False}
    summary_df = run_all_benchmarks(**options)

    print("\nBenchmark Summary Table:")
    print(summary_df.to_string())


if __name__ == "__main__":
    main()

Self-Supervised Pre-training

Demonstrates how to use unlabeled or semi-labeled data to pre-train a model using various self-supervised tasks.

# -*- coding: utf-8 -*-
"""
Tutorial 06: Self-Supervised Pre-training
-----------------------------------------
This tutorial demonstrates how to use unlabeled or semi-labeled data
to pre-train a model using various self-supervised tasks.
"""

from pathlib import Path
from fishy._core.config import TrainingConfig
from fishy.experiments.deep_training import ModelTrainer

PROJECT_ROOT = Path(__file__).resolve().parent.parent


def main():
    """Pre-train a transformer with two self-supervised tasks enabled.

    Builds a ``TrainingConfig`` with the masked-spectra-modelling and
    spectrum-denoising flags set, wraps it in ``ModelTrainer`` and calls
    ``pre_train()``. A success message is printed when the returned model
    is truthy.
    """
    print("--- Tutorial 06: Self-Supervised Pre-training ---")

    # Step 1: pre-training tasks are switched on through config flags --
    # here Masked Spectra Modelling (MSM) and Denoising (SDA).
    settings = {
        "model": "transformer",
        "dataset": "species",
        "epochs": 5,
        "masked_spectra_modelling": True,
        "spectrum_denoising_autoencoding": True,
        "wandb_log": False,
    }
    config = TrainingConfig(**settings)

    # Step 2: the high-level ModelTrainer drives the whole procedure.
    trainer = ModelTrainer(config)

    # Step 3: pre-train. Each enabled task is executed in sequence, with
    # the weights chained from one task to the next.
    print(f"Starting pre-training tasks for {config.model}...")
    pre_trained_model = trainer.pre_train()

    if not pre_trained_model:
        return

    print("\nPre-training successful! Model is now ready for fine-tuning.")

    # Step 4 (optional): continue to fine-tuning with the learned weights:
    #     results = trainer.train(pre_trained_model)
    #     print(f"Fine-tuned Accuracy: {results.get('balanced_accuracy', 0):.4f}")


if __name__ == "__main__":
    main()

Sequential Transfer Learning

How to transfer knowledge from one dataset to another sequentially, using different classes/tasks at each stage.

# -*- coding: utf-8 -*-
"""
Tutorial 07: Sequential Transfer Learning
-----------------------------------------
This tutorial shows how to transfer knowledge from one dataset to another
sequentially, using different classes/tasks at each stage.
"""

from pathlib import Path
from fishy.experiments.transfer import run_sequential_transfer_learning

PROJECT_ROOT = Path(__file__).resolve().parent.parent


def main():
    """Transfer a transformer from the ``part`` dataset to ``species``.

    Calls ``run_sequential_transfer_learning`` with one source dataset and
    one target dataset, then prints the last recorded
    ``val_balanced_acc`` entry of the fine-tuning history for ``species``.
    """
    print("--- Tutorial 07: Sequential Transfer Learning ---")

    # Goal: train on 'part' (e.g. skin vs fillet), then carry that knowledge
    # over to 'species' (e.g. hoki vs mackerel).
    #
    # run_sequential_transfer_learning automates the whole chain:
    #   1. load the source dataset
    #   2. train the model
    #   3. swap the classification head
    #   4. load the target dataset
    #   5. fine-tune the final model

    print("Starting Transfer Learning: [part] -> [species]")

    transfer_args = {
        "model_name": "transformer",
        "transfer_datasets": ["part"],
        "target_dataset": "species",
        "num_epochs_transfer": 5,  # kept short for the example
        "num_epochs_finetune": 5,
        "batch_size": 32,
        "wandb_log": False,
    }
    _model, history = run_sequential_transfer_learning(**transfer_args)

    print("\nTransfer Learning complete.")

    # Report the final entry of the target dataset's validation curve.
    final_acc = history['finetune']['species']['val_balanced_acc'][-1]
    print(f"Final accuracy on target dataset: {final_acc:.2f}%")


if __name__ == "__main__":
    main()

Probabilistic Inference

Using Bayesian models like Gaussian Processes to get predictions along with uncertainty estimates.

# -*- coding: utf-8 -*-
"""
Tutorial 08: Probabilistic Inference and Uncertainty
----------------------------------------------------
This tutorial demonstrates how to use Bayesian models like Gaussian Processes
to get not just predictions, but also uncertainty estimates.
"""

from fishy._core.config import TrainingConfig
from fishy.experiments.classic_training import run_sklearn_experiment
from fishy.data.module import create_data_module
from sklearn.model_selection import train_test_split
from pathlib import Path
import numpy as np

PROJECT_ROOT = Path(__file__).resolve().parent.parent


def main():
    """Train a Gaussian Process and print per-sample uncertainty.

    First runs ``run_sklearn_experiment`` for the ``gp`` model on
    ``species`` and prints the reported ``val_balanced_accuracy``. Then
    fits a ``GP`` instance on a stratified 50-sample training split and
    prints predictions alongside ``get_uncertainty`` values for the first
    five test samples.
    """
    print("--- Tutorial 08: Probabilistic Inference ---")

    # Step 1: configure a probabilistic model -- the Gaussian Process (GP)
    # registered in `models.yaml`. Two folds keep the example fast.
    config = TrainingConfig(model="gp", dataset="species", k_folds=2)

    # Step 2: run the experiment through the classic-training entry point.
    print(f"Training {config.model} with uncertainty estimation...")
    stats = run_sklearn_experiment(config, "gp", "species")

    print(f"\nGP Mean Accuracy: {stats['val_balanced_accuracy']:.4f}")

    # Step 3: inspect uncertainty by hand, straight from the model.
    data_module = create_data_module("species")
    data_module.setup()
    X, y = data_module.get_numpy_data(labels_as_indices=True)

    # A shuffled, stratified split so that the small training subset
    # contains both classes.
    split = train_test_split(X, y, train_size=50, stratify=y, random_state=42)
    X_train, X_test, y_train, y_test = split

    # The GP class is a Scikit-learn style wrapper.
    from fishy.models.probabilistic.gp import GP

    gp_model = GP()
    gp_model.fit(X_train, y_train)

    # Uncertainty is 1.0 - max_prob; query it and the predictions for the
    # first five test samples.
    head = X_test[:5]
    uncertainty = gp_model.get_uncertainty(head)
    preds = gp_model.predict(head)

    print("\nPredictions with Uncertainty (Test Subset):")
    for i, pair in enumerate(zip(preds, uncertainty)):
        p, u = pair
        print(f"  Sample {i}: Pred={p}, Uncertainty={u:.4f}")


if __name__ == "__main__":
    main()

Outputs and Visualization

Where to find experiment results and how to interpret generated artifacts like logs, metrics, and figures.

# -*- coding: utf-8 -*-
"""
Tutorial 09: Outputs and Visualization
--------------------------------------
This tutorial explains where to find the results of your experiments
and how to interpret the generated artifacts.
"""

from fishy._core.utils import RunContext
from pathlib import Path


def main():
    """Create a ``RunContext`` and print where its artifacts are stored.

    Prints the run directory followed by the log, result, figure and
    checkpoint directories exposed by the context, then the config flags
    that cause these artifacts to be generated during a run.
    """
    print("--- Tutorial 09: Outputs and Visualization ---")

    # A `RunContext` is created for every experiment run. It sets up a
    # structured output directory of the form:
    #   outputs/{dataset}/{method}/{model}_{timestamp}/
    context_args = {
        "dataset": "species",
        "method": "deep",
        "model_name": "transformer",
    }
    ctx = RunContext(**context_args)

    print(f"\nRun directory created at: {ctx.run_dir}")

    # Logs live in {run_dir}/logs/experiment.log
    print(f"  Logs:       {ctx.log_dir}")

    # Metrics live in {run_dir}/results/metrics.json and are written with
    # the custom NumpyEncoder.
    print(f"  Results:    {ctx.result_dir}")

    # Figures live in {run_dir}/figures/ -- training curves appear here
    # when `figures=True` is set in the config.
    print(f"  Figures:    {ctx.figure_dir}")

    # Checkpoints live in {run_dir}/checkpoints/ -- the best model weights
    # are saved here during training.
    print(f"  Checkpoints: {ctx.checkpoint_dir}")

    closing_lines = (
        "\nTo generate these automatically during a run, ensure your config has:",
        "  config.benchmark = True",
        "  config.figures = True",
    )
    for line in closing_lines:
        print(line)


if __name__ == "__main__":
    main()