Experimental Results Leaderboard¶
This page provides an interactive view of results for 16 models across 4 datasets, aggregated from over 1900 individual runs.
[1]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from pathlib import Path
import os
# Set renderer for Sphinx/ReadTheDocs compatibility.
# Every fig.show() call below is made without arguments, so it uses this default.
pio.renderers.default = 'notebook'
# Load summarized and raw data
def read_csv_or_empty(path):
    """Return the CSV at *path* as a DataFrame, or an empty DataFrame if the file does not exist."""
    return pd.read_csv(path) if path.exists() else pd.DataFrame()


def clean_summary(frame):
    """Normalize the summary table in place and return it.

    Every column is optional; a column that is absent is simply skipped.

    * 'Sig Tr' / 'Sig Te': NaNs (from empty cells in the CSV) become a blank.
    * 'Train', 'Test', 'Test Std', 'Train Std': coerced to numeric, with
      unparseable or missing values replaced by 0.0.
    """
    for col in ('Sig Tr', 'Sig Te'):
        if col in frame.columns:
            frame[col] = frame[col].fillna(' ')
    for col in ('Train', 'Test', 'Test Std', 'Train Std'):
        if col in frame.columns:
            frame[col] = pd.to_numeric(frame[col], errors='coerce').fillna(0.0)
    return frame


# Data files are read from the `_static` directory, relative to the working directory.
base_path = Path('_static')
sum_path = base_path / 'leaderboard_data.csv'
raw_path = base_path / 'leaderboard_raw.csv'

df = read_csv_or_empty(sum_path)
df_raw = read_csv_or_empty(raw_path)

if not df.empty:
    df = clean_summary(df)
def get_color_map(methods):
    """Map each distinct method to a color from the Plotly + Bold qualitative palettes.

    Methods are de-duplicated and sorted before colors are assigned, so the
    mapping does not depend on the order of *methods*. The palette wraps
    around when there are more methods than colors.
    """
    palette = px.colors.qualitative.Plotly + px.colors.qualitative.Bold
    palette_size = len(palette)
    mapping = {}
    for position, method in enumerate(sorted(set(methods))):
        mapping[method] = palette[position % palette_size]
    return mapping
# One shared method -> color mapping, passed to the charts below via color_map.
if df.empty:
    color_map = {}
else:
    color_map = get_color_map(df['Method'].unique())
Statistical Summary Table¶
The table below summarizes the mean performance across 30 runs.
Note on Significance: The symbols +, -, and ≈ indicate statistical significance compared to the OPLS-DA baseline using a paired t-test (p < 0.05).
| Symbol | Meaning |
|---|---|
| + | Significantly better than OPLS-DA |
| - | Significantly worse than OPLS-DA |
| ≈ | No significant difference |
[2]:
if not df.empty:
    # Columns to display, in order, each paired with its d3-format spec
    # (None leaves the cell unformatted). Looking the spec up by column name
    # keeps formats aligned with their columns when a column is missing.
    column_formats = {
        'Dataset': None,
        'Method': None,
        'Train': '.4f',
        'Train Std': '.4f',
        'Sig Tr': None,
        'Test': '.4f',
        'Test Std': '.4f',
        'Sig Te': None,
        'Runtime': '.2f',
    }
    cols = list(column_formats)
    actual_cols = [c for c in cols if c in df.columns]
    # Rows are grouped by dataset, highest 'Test' value first within each dataset.
    pdf = df[actual_cols].sort_values(['Dataset', 'Test'], ascending=[True, False])
    fig = go.Figure(data=[go.Table(
        header=dict(values=[f"<b>{c}</b>" for c in pdf.columns], fill_color='paleturquoise', align='left'),
        cells=dict(values=[pdf[c] for c in pdf.columns],
                   format=[column_formats[c] for c in pdf.columns],
                   fill_color='lavender', align='left'))
    ])
    fig.update_layout(margin=dict(l=0, r=0, t=0, b=0), height=800)
    fig.show()
else:
    print("No data available for summary table.")
Performance by Dataset¶
The bar charts below show the mean balanced accuracy for each method, with error bars representing one standard deviation across runs.
[3]:
if df.empty:
    print("No results available for bar charts.")
else:
    # Error bars are drawn only when the summary has a 'Test Std' column.
    error_column = 'Test Std' if 'Test Std' in df.columns else None
    for ds in df['Dataset'].unique():
        # One chart per dataset, bars ordered from highest to lowest 'Test'.
        ranked = df[df['Dataset'] == ds].sort_values('Test', ascending=False)
        fig = px.bar(
            ranked,
            x='Method',
            y='Test',
            error_y=error_column,
            title=f"Leaderboard: {ds.upper()}",
            color='Method',
            color_discrete_map=color_map,
            template='plotly_white',
            labels={'Test': 'Mean Balanced Accuracy'},
        )
        fig.update_layout(yaxis_range=[0, 1.05])
        fig.show()
🛡️ Stability Analysis (Box Plots)¶
These box plots visualize the variance across all 30 runs for each model. Within each chart, methods are ordered by descending mean test performance.
[4]:
if df_raw.empty or df.empty:
    print("No raw data available for stability plots.")
else:
    for ds in df_raw['Dataset'].unique():
        # Box order follows the summary table for this dataset: highest 'Test' first.
        summary_rows = df[df['Dataset'] == ds].sort_values('Test', ascending=False)
        sorted_methods = summary_rows['Method'].tolist()
        ds_raw = df_raw[df_raw['Dataset'] == ds].copy()
        fig = px.box(
            ds_raw,
            x='Method',
            y='Test Accuracy',
            color='Method',
            color_discrete_map=color_map,
            points='all',
            template='plotly_white',
            category_orders={'Method': sorted_methods},
            title=f"Stability Distribution: {ds.upper()}",
        )
        fig.show()
Global Performance Heatmap¶
A bird's-eye view of how all models perform across all datasets.
[5]:
if df.empty:
    print("No data available for heatmap.")
else:
    # Rows are methods, columns are datasets, cells hold the 'Test' value.
    pivot_df = df.pivot(index='Method', columns='Dataset', values='Test')
    fig = px.imshow(
        pivot_df,
        text_auto=".3f",
        aspect="auto",
        color_continuous_scale='Viridis',
        title="Model Generalization across Datasets",
        template='plotly_white',
    )
    fig.show()
🎯 Efficiency Frontier¶
Comparison of training vs. testing accuracy. Ideally, models should be in the top-right corner with a small gap between their training and testing accuracy.
[6]:
if not df.empty:
    # 'Test Std' is treated as optional elsewhere on this page (the cleaning
    # loop and the bar charts), so marker sizing is applied only when the
    # column exists.
    size_column = "Test Std" if "Test Std" in df.columns else None
    # One facet per dataset; x is the 'Test' value, y is the 'Train' value.
    fig = px.scatter(df, x="Test", y="Train", size=size_column, color="Method",
                     facet_col="Dataset", hover_name="Method", color_discrete_map=color_map,
                     template="plotly_white", title="Training vs Testing Performance")
    fig.update_layout(height=400)
    fig.show()
else:
    print("No data available for efficiency frontier.")
Top 3 Methods Comparison (Radar Chart)¶
This chart compares the three best-performing models (ranked by mean test score) for each dataset across multiple metrics.
[7]:
if df.empty:
    print("No data available for radar charts.")
else:
    # Angular axis labels, in the same order as the values passed as `r` below.
    radar_axes = ['Train Acc', 'Test Acc', 'Lower Bound', 'Upper Bound']
    for ds in df['Dataset'].unique():
        # Keep the three rows with the highest 'Test' value for this dataset.
        ds_df = df[df['Dataset'] == ds].sort_values('Test', ascending=False).head(3)
        fig = go.Figure()
        for _, row in ds_df.iterrows():
            train_acc, test_acc, test_std = row['Train'], row['Test'], row['Test Std']
            trace = go.Scatterpolar(
                # Lower/upper bound are the test value minus/plus its standard deviation.
                r=[train_acc, test_acc, test_acc - test_std, test_acc + test_std],
                theta=radar_axes,
                fill='toself',
                name=row['Method'],
                line=dict(color=color_map.get(row['Method'])),
            )
            fig.add_trace(trace)
        fig.update_layout(
            polar=dict(radialaxis=dict(visible=True, range=[0, 1])),
            title=f"Top 3 Profile: {ds.upper()}",
            template='plotly_white',
        )
        fig.show()