Experimental Results Leaderboard

This page provides an interactive view of results for 16 models across 4 datasets, aggregated from over 1900 individual runs.

[1]:

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from pathlib import Path
import os

# Set renderer for Sphinx/ReadTheDocs compatibility: make 'notebook' the
# default Plotly renderer, which is the one used by the `fig.show()` calls in
# this page because none of them passes an explicit renderer.
pio.renderers.default = 'notebook'

# Load the summarized and raw result tables from the local `_static` directory.
# `Path('.') / '_static'` is the same relative path as `Path('_static')`, so the
# previous exists()-based choice between the two was a no-op and is dropped.
base_path = Path('_static')
sum_path = base_path / 'leaderboard_data.csv'
raw_path = base_path / 'leaderboard_raw.csv'

# A missing CSV yields an empty frame, so the `.empty` checks below and in the
# plotting cells can skip work instead of failing on a missing file.
df = pd.read_csv(sum_path) if sum_path.exists() else pd.DataFrame()
df_raw = pd.read_csv(raw_path) if raw_path.exists() else pd.DataFrame()


def clean_summary(frame):
    """Normalize the summary table in place and return it.

    - Significance columns ('Sig Tr', 'Sig Te'): NaN (from blank CSV cells)
      is replaced by a single space.
    - Metric columns ('Train', 'Test', 'Test Std', 'Train Std'): values are
      coerced to numbers; anything unparsable or missing becomes 0.0.

    Every column is optional: columns absent from ``frame`` are skipped, so a
    summary file without significance markers no longer raises ``KeyError``.

    Args:
        frame: pandas DataFrame holding the summary table.

    Returns:
        The same DataFrame object, after in-place cleaning.
    """
    for col in ('Sig Tr', 'Sig Te'):
        if col in frame.columns:
            frame[col] = frame[col].fillna(' ')

    for col in ('Train', 'Test', 'Test Std', 'Train Std'):
        if col in frame.columns:
            frame[col] = pd.to_numeric(frame[col], errors='coerce').fillna(0.0)

    return frame


if not df.empty:
    clean_summary(df)

def get_color_map(methods):
    """Assign each distinct method a color from a combined Plotly palette.

    The distinct entries of ``methods`` are sorted, then paired in order with
    the colors of ``px.colors.qualitative.Plotly`` followed by
    ``px.colors.qualitative.Bold``. When there are more methods than colors,
    the palette wraps around and colors repeat.

    Args:
        methods: iterable of method names (duplicates are ignored).

    Returns:
        dict mapping each distinct method to a color from the palette.
    """
    palette = px.colors.qualitative.Plotly + px.colors.qualitative.Bold
    palette_size = len(palette)
    assigned = {}
    for position, method in enumerate(sorted(set(methods))):
        assigned[method] = palette[position % palette_size]
    return assigned

# One shared method -> color mapping, passed as `color_discrete_map` to the
# plotting cells; left empty when no summary data was loaded.
if df.empty:
    color_map = {}
else:
    color_map = get_color_map(df['Method'].unique())

📊 Statistical Summary Table

The table below summarizes the mean performance across 30 runs.

Note on Significance: The symbols +, -, and ≈ report the outcome of a paired t-test (p < 0.05) comparing each method against the OPLS-DA baseline.

Symbol	Meaning
`+`	Significantly better than OPLS-DA
`-`	Significantly worse than OPLS-DA
`≈`	No significant difference

[2]:

if not df.empty:
    # Columns to display, in display order; any that are missing from the
    # summary file are skipped.
    cols = ['Dataset', 'Method', 'Train', 'Train Std', 'Sig Tr', 'Test', 'Test Std', 'Sig Te', 'Runtime']
    actual_cols = [c for c in cols if c in df.columns]

    # Number format per column name. Looking the format up by name keeps each
    # format attached to its column even when some columns are absent; a fixed
    # positional list would shift onto the wrong columns in that case.
    # Columns without an entry (text columns) get None, i.e. no formatting.
    column_formats = {
        'Train': '.4f',
        'Train Std': '.4f',
        'Test': '.4f',
        'Test Std': '.4f',
        'Runtime': '.2f',
    }

    # Group rows by dataset, best test score first within each dataset.
    pdf = df[actual_cols].sort_values(['Dataset', 'Test'], ascending=[True, False])

    fig = go.Figure(data=[go.Table(
        header=dict(values=[f"<b>{c}</b>" for c in pdf.columns], fill_color='paleturquoise', align='left'),
        cells=dict(values=[pdf[c] for c in pdf.columns],
                   format=[column_formats.get(c) for c in pdf.columns],
                   fill_color='lavender', align='left'))
    ])
    fig.update_layout(margin=dict(l=0, r=0, t=0, b=0), height=800)
    fig.show()
else:
    print("No data available for summary table.")

📊 Performance by Dataset

The bar charts below show the mean balanced accuracy for each method, with error bars representing one standard deviation across runs.

[3]:

if df.empty:
    print("No results available for bar charts.")
else:
    # One bar chart per dataset, bars ordered from best to worst test score.
    for ds in df['Dataset'].unique():
        rows_for_ds = df[df['Dataset'] == ds]
        ds_df = rows_for_ds.sort_values('Test', ascending=False)

        # Error bars are drawn only when the summary has a 'Test Std' column.
        if 'Test Std' in ds_df.columns:
            error_column = 'Test Std'
        else:
            error_column = None

        fig = px.bar(
            ds_df,
            x='Method',
            y='Test',
            error_y=error_column,
            title=f"Leaderboard: {ds.upper()}",
            color='Method',
            color_discrete_map=color_map,
            template='plotly_white',
            labels={'Test': 'Mean Balanced Accuracy'},
        )
        # Fixed y-axis so charts for different datasets are directly comparable.
        fig.update_layout(yaxis_range=[0, 1.05])
        fig.show()

🛡️ Stability Analysis (Box Plots)

The box plots below visualize the spread of test accuracy across all 30 runs for each model. Within each chart, models are ordered by descending mean test performance.

[4]:

if df_raw.empty or df.empty:
    print("No raw data available for stability plots.")
else:
    # One box plot per dataset found in the raw (per-run) results.
    for ds in df_raw['Dataset'].unique():
        # Order the boxes by the summary table's mean test score, best first.
        summary_rows = df[df['Dataset'] == ds]
        ranked = summary_rows.sort_values('Test', ascending=False)
        sorted_methods = ranked['Method'].tolist()

        ds_raw = df_raw[df_raw['Dataset'] == ds].copy()

        fig = px.box(
            ds_raw,
            x='Method',
            y='Test Accuracy',
            color='Method',
            color_discrete_map=color_map,
            points='all',
            template='plotly_white',
            category_orders={'Method': sorted_methods},
            title=f"Stability Distribution: {ds.upper()}",
        )
        fig.show()

📈 Global Performance Heatmap

A bird’s-eye view of how all models perform across all datasets.

[5]:

if df.empty:
    print("No data available for heatmap.")
else:
    # Reshape to a Method x Dataset grid holding the 'Test' value of each pair.
    pivot_df = df.pivot(index='Method', columns='Dataset', values='Test')
    fig = px.imshow(
        pivot_df,
        text_auto=".3f",
        aspect="auto",
        color_continuous_scale='Viridis',
        title="Model Generalization across Datasets",
        template='plotly_white',
    )
    fig.show()

🎯 Efficiency Frontier

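Comparison of training vs. testing accuracy, with one panel per dataset. Marker size reflects the standard deviation of the test score. Ideally, a model sits in the top-right corner (high accuracy on both) with only a small gap between its training and testing accuracy.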

[6]:

if not df.empty:
    # Marker size encodes the test standard deviation when the summary has a
    # 'Test Std' column. That column is treated as optional elsewhere in this
    # page, so fall back to uniform markers instead of failing when it is absent.
    size_column = "Test Std" if "Test Std" in df.columns else None

    # One facet per dataset: x = test score, y = train score, one marker per method.
    fig = px.scatter(df, x="Test", y="Train", size=size_column, color="Method",
                     facet_col="Dataset", hover_name="Method", color_discrete_map=color_map,
                     template="plotly_white", title="Training vs Testing Performance")
    fig.update_layout(height=400)
    fig.show()
else:
    print("No data available for efficiency frontier.")

🏆 Top 3 Methods Comparison (Radar Chart)

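For each dataset, this chart compares the three models with the highest mean test accuracy along four axes: training accuracy, test accuracy, and the test accuracy minus and plus one standard deviation (lower and upper bound).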

[7]:

if not df.empty:
    # 'Test Std' is treated as optional elsewhere in this page; when it is
    # absent the lower/upper bounds collapse onto the test score instead of
    # raising KeyError.
    has_test_std = 'Test Std' in df.columns

    for ds in df['Dataset'].unique():
        # The three methods with the highest test score for this dataset.
        ds_df = df[df['Dataset'] == ds].sort_values('Test', ascending=False).head(3)
        fig = go.Figure()
        for _, row in ds_df.iterrows():
            test_std = row['Test Std'] if has_test_std else 0.0
            # One polygon per method: train score, test score, test -/+ one std.
            fig.add_trace(go.Scatterpolar(
                r=[row['Train'], row['Test'], row['Test'] - test_std, row['Test'] + test_std],
                theta=['Train Acc', 'Test Acc', 'Lower Bound', 'Upper Bound'],
                fill='toself', name=row['Method'],
                line=dict(color=color_map.get(row['Method']))
            ))
        fig.update_layout(
            polar=dict(radialaxis=dict(visible=True, range=[0, 1])),
            title=f"Top 3 Profile: {ds.upper()}", template='plotly_white'
        )
        fig.show()
else:
    print("No data available for radar charts.")