Examples
This page provides complete examples for common FTIR preprocessing workflows.
Example 1: Notebook-Style Method Selection
This mirrors the method-selection workflow used in the notebooks.
from xpectrass import FTIRdataprocessing, load_villegas_camacho_2024_c4
# Load the bundled dataset and wrap it in the preprocessing helper.
# The names fdp, denoise_method and baseline_method defined in this snippet
# are reused by the follow-up atmospheric/normalization example.
dataset = load_villegas_camacho_2024_c4()
fdp = FTIRdataprocessing(
df=dataset,
label_column="type",
flat_windows=[(1880, 1900), (2400, 2700)],
)
# Convert first (transmittance -> absorbance)
df_abs = fdp.convert(plot=False)
# 1) Pick denoising method
denoise_eval = fdp.find_denoising_method(
data=df_abs,
methods="FTIR",
n_samples=50,
plot=False,
)
best_denoise = fdp.best_denoising_methods(eval_df=denoise_eval, top_n=5)
# Take the first row of the ranking table as the winner
# (assumes the table is sorted best-first -- confirm against the API docs).
denoise_method = best_denoise.iloc[0]["method"]
# 2) Pick baseline correction method
# Baseline candidates are evaluated on spectra denoised with the method chosen above.
df_denoised = fdp._get_denoised_data(denoising_method=denoise_method, plot=False)
# The evaluation returns three tables, which are handed to
# best_baseline_method below as rfzn_tbl, nar_tbl and snr_tbl.
rfzn, nar, snr = fdp.find_baseline_method(
data=df_denoised,
flat_windows=[(1880, 1900), (2400, 2700)],
baseline_methods="FTIR",
n_samples=50,
plot=False,
)
best_baseline = fdp.best_baseline_method(
rfzn_tbl=rfzn,
nar_tbl=nar,
snr_tbl=snr,
top_n=5,
)
# As with denoising, the first row is used as the selected method.
baseline_method = best_baseline.iloc[0]["method"]
print("Best denoising:", denoise_method)
print("Best baseline:", baseline_method)
Example 2: Atmospheric + Normalization Workflow
# Continue from Example 1
# This snippet is not self-contained: fdp, denoise_method and baseline_method
# must already exist from the method-selection example (Example 1).
df_atm = fdp._get_atmosphere_corrected_data(
denoising_method=denoise_method,
baseline_correction_method=baseline_method,
interpolate_method="zero",
plot=False,
)
# Rank normalization methods by classification-oriented metrics
norm_scores = fdp.find_normalization_method(
data=df_atm,
methods="FTIR",
n_splits=5,
)
# The first row is used as the selected method
# (assumes norm_scores is sorted best-first -- confirm against the API docs).
normalization_method = norm_scores.iloc[0]["method"]
# Run full preprocessing with selected methods
# NOTE: this call takes method names rather than df_atm, so it presumably
# re-runs the earlier stages internally -- confirm against the API docs.
df_norm = fdp._get_normalized_data(
denoising_method=denoise_method,
baseline_correction_method=baseline_method,
interpolate_method="zero",
normalization_method=normalization_method,
plot=False,
)
# Persist the fully preprocessed table; the file is written to the current working directory.
df_norm.to_excel("DenoisedBaselineAtmosphericCorrectedNormalizedData.xlsx", index=False)
print("Saved normalized data.")
Example 3: Derivatives + Multi-Dataset Combination
from xpectrass import FTIRdataprocessing, load_all_datasets, combine_datasets
# Load every bundled dataset; individual studies are looked up by name below.
all_sets = load_all_datasets()
# Constructor arguments shared by both studies so they are configured identically.
common_kwargs = dict(
label_column="type",
exclude_regions=[(0, 680), (3500, 5000)],
interpolate_regions=[(1250, 2700)],
flat_windows=[(1880, 1900), (2400, 2700)],
)
# Both studies are preprocessed with the same fixed method choices
# (wavelet / aspls / zero / snv_detrend).
fdp_jung = FTIRdataprocessing(df=all_sets["jung_2018"], **common_kwargs)
jung_norm = fdp_jung._get_normalized_data(
denoising_method="wavelet",
baseline_correction_method="aspls",
interpolate_method="zero",
normalization_method="snv_detrend",
plot=False,
)
fdp_frond = FTIRdataprocessing(df=all_sets["frond_2021"], **common_kwargs)
frond_norm = fdp_frond._get_normalized_data(
denoising_method="wavelet",
baseline_correction_method="aspls",
interpolate_method="zero",
normalization_method="snv_detrend",
plot=False,
)
# Derivatives
# First- and second-order derivatives of the normalized jung_2018 spectra.
# They are computed for illustration only; the rest of this snippet does not use them.
jung_d1 = fdp_jung.derivatives(data=jung_norm, order=1, window_length=15, polyorder=3, delta=1.0, plot=False)
jung_d2 = fdp_jung.derivatives(data=jung_norm, order=2, window_length=15, polyorder=3, delta=1.0, plot=False)
# Combine normalized datasets on a common grid
# study_names is given in the same order as datasets.
combined_norm, grid = combine_datasets(
datasets=[jung_norm, frond_norm],
wn_min=680,
wn_max=3000,
resolution=2.0,
descending=True,
method="pchip",
label_column="type",
add_study_column=True,
study_names=["jung_2018", "frond_2021"],
show_progress=True,
n_jobs=4,
data_mode="normalized",
)
# Write an xz-compressed CSV.
# NOTE: presumably the processed_data/ directory must already exist -- confirm.
combined_norm.to_csv("processed_data/combined_norm_data.csv.xz", compression="xz", index=None)
print("Combined shape:", combined_norm.shape)
Example 4: Notebook-Style Data Analysis Class
import pandas as pd
from xpectrass import FTIRdataanalysis
# Load a previously saved, xz-compressed combined dataset.
# NOTE: no example on this page writes combined_deriv1_data.csv.xz
# (Example 3 saves combined_norm_data.csv.xz), so this file must be produced separately.
df = pd.read_csv("processed_data/combined_deriv1_data.csv.xz", compression="xz")
# Drop all rows belonging to the kedzierski_2019_u study before analysis.
df = df[df["study"] != "kedzierski_2019_u"]
fda = FTIRdataanalysis(
df=df,
dataset_name="Combined dataset",
label_column="type",
exclude_columns=["study", "sample_id", "environmental", "resolution"],
random_state=42,
n_jobs=-1,
)
# Exploratory plots: PCA, then UMAP.
fda.plot_pca(standardize=True, handle_missing="zero", figsize=(12, 8))
fda.plot_umap(
n_neighbors=100,
min_dist=0.5,
pca_components=20,
standardize=True,
handle_missing="zero",
figsize=(12, 8),
)
# Prepare the ML data with test_size=0.2, then evaluate one named model and the full model set.
data_dict = fda.ml_prepare_data(test_size=0.2)
single = fda.run_a_model(model_name="XGBoost (100)", cv_folds=5, plot_confusion=True)
all_results = fda.run_all_models(plot_comparison=True, accuracy_threshold=0.9, top_n_methods=20)