Examples

This page provides complete examples for common FTIR preprocessing workflows.

Example 1: Notebook-Style Method Selection

This mirrors the method-selection workflow used in the notebooks.

from xpectrass import FTIRdataprocessing, load_villegas_camacho_2024_c4

# Windows handed to both the processor and the baseline search below;
# defined once so the two calls cannot drift apart.
flat_windows = [(1880, 1900), (2400, 2700)]

dataset = load_villegas_camacho_2024_c4()
fdp = FTIRdataprocessing(
    df=dataset,
    label_column="type",
    flat_windows=flat_windows,
)

# Step 0: convert first (transmittance -> absorbance).
df_abs = fdp.convert(plot=False)

# Step 1: evaluate the "FTIR" set of denoising methods on 50 samples, then
# keep the top five and take the first row as the method to use.
denoise_eval = fdp.find_denoising_method(
    data=df_abs, methods="FTIR", n_samples=50, plot=False
)
best_denoise = fdp.best_denoising_methods(eval_df=denoise_eval, top_n=5)
denoise_method = best_denoise["method"].iloc[0]

# Step 2: denoise with the chosen method, then evaluate baseline-correction
# methods on the denoised spectra. The search returns three tables
# (rfzn, nar, snr) that are combined into a single ranking.
df_denoised = fdp._get_denoised_data(denoising_method=denoise_method, plot=False)
rfzn, nar, snr = fdp.find_baseline_method(
    data=df_denoised,
    flat_windows=flat_windows,
    baseline_methods="FTIR",
    n_samples=50,
    plot=False,
)
best_baseline = fdp.best_baseline_method(
    rfzn_tbl=rfzn, nar_tbl=nar, snr_tbl=snr, top_n=5
)
baseline_method = best_baseline["method"].iloc[0]

print("Best denoising:", denoise_method)
print("Best baseline:", baseline_method)

Example 2: Atmospheric + Normalization Workflow

# Continue from Example 1: this snippet reuses `fdp`, `denoise_method`, and
# `baseline_method` defined there.

# Upstream steps shared by both calls below. Keeping them in one place
# ensures the final run uses the same settings the normalization ranking
# was computed on.
upstream_steps = dict(
    denoising_method=denoise_method,
    baseline_correction_method=baseline_method,
    interpolate_method="zero",
    plot=False,
)

df_atm = fdp._get_atmosphere_corrected_data(**upstream_steps)

# Rank normalization methods by classification-oriented metrics
norm_scores = fdp.find_normalization_method(
    data=df_atm,
    methods="FTIR",
    n_splits=5,
)
# Take the method in the first row of the score table.
normalization_method = norm_scores.iloc[0]["method"]

# Run full preprocessing with selected methods
df_norm = fdp._get_normalized_data(
    normalization_method=normalization_method,
    **upstream_steps,
)

# NOTE: pandas needs an Excel writer engine (e.g. openpyxl) installed to
# write .xlsx files.
df_norm.to_excel("DenoisedBaselineAtmosphericCorrectedNormalizedData.xlsx", index=False)
print("Saved normalized data.")

Example 3: Derivatives + Multi-Dataset Combination

from pathlib import Path

from xpectrass import FTIRdataprocessing, load_all_datasets, combine_datasets

all_sets = load_all_datasets()

# Constructor settings applied identically to every dataset.
common_kwargs = dict(
    label_column="type",
    exclude_regions=[(0, 680), (3500, 5000)],
    interpolate_regions=[(1250, 2700)],
    flat_windows=[(1880, 1900), (2400, 2700)],
)

# Preprocessing choices applied identically to every dataset, so the
# normalized outputs are comparable before they are combined.
norm_kwargs = dict(
    denoising_method="wavelet",
    baseline_correction_method="aspls",
    interpolate_method="zero",
    normalization_method="snv_detrend",
    plot=False,
)

fdp_jung = FTIRdataprocessing(df=all_sets["jung_2018"], **common_kwargs)
jung_norm = fdp_jung._get_normalized_data(**norm_kwargs)

fdp_frond = FTIRdataprocessing(df=all_sets["frond_2021"], **common_kwargs)
frond_norm = fdp_frond._get_normalized_data(**norm_kwargs)

# Derivatives: first- and second-order derivatives of the normalized
# jung_2018 spectra (shown for illustration; not used further below).
jung_d1 = fdp_jung.derivatives(data=jung_norm, order=1, window_length=15, polyorder=3, delta=1.0, plot=False)
jung_d2 = fdp_jung.derivatives(data=jung_norm, order=2, window_length=15, polyorder=3, delta=1.0, plot=False)

# Combine normalized datasets on a common grid
combined_norm, grid = combine_datasets(
    datasets=[jung_norm, frond_norm],
    wn_min=680,
    wn_max=3000,
    resolution=2.0,
    descending=True,
    method="pchip",
    label_column="type",
    add_study_column=True,
    study_names=["jung_2018", "frond_2021"],
    show_progress=True,
    n_jobs=4,
    data_mode="normalized",
)

# pandas does not create missing parent directories, so make sure the
# output folder exists before writing.
output_dir = Path("processed_data")
output_dir.mkdir(parents=True, exist_ok=True)
combined_norm.to_csv(output_dir / "combined_norm_data.csv.xz", compression="xz", index=False)
print("Combined shape:", combined_norm.shape)

Example 4: Notebook-Style Data Analysis Class

import pandas as pd
from xpectrass import FTIRdataanalysis

# NOTE(review): this input file is not written by the code shown on this
# page (Example 3 saves combined_norm_data.csv.xz) - confirm where it is
# produced before running.
df = pd.read_csv("processed_data/combined_deriv1_data.csv.xz", compression="xz")

# Drop every row belonging to the "kedzierski_2019_u" study.
keep_rows = df["study"] != "kedzierski_2019_u"
df = df[keep_rows]

fda = FTIRdataanalysis(
    df=df,
    dataset_name="Combined dataset",
    label_column="type",
    exclude_columns=["study", "sample_id", "environmental", "resolution"],
    random_state=42,
    n_jobs=-1,
)

# Options shared by both projection plots.
projection_kwargs = dict(standardize=True, handle_missing="zero", figsize=(12, 8))

fda.plot_pca(**projection_kwargs)
fda.plot_umap(n_neighbors=100, min_dist=0.5, pca_components=20, **projection_kwargs)

# Split the data (20% test), then fit one named model and finally the full
# model suite.
data_dict = fda.ml_prepare_data(test_size=0.2)
single = fda.run_a_model(
    model_name="XGBoost (100)",
    cv_folds=5,
    plot_confusion=True,
)
all_results = fda.run_all_models(
    plot_comparison=True,
    accuracy_threshold=0.9,
    top_n_methods=20,
)