Source code for xpectrass.utils.file_management

from __future__ import annotations
from pathlib import Path
from typing import Iterable, Literal
from pathlib import Path
import pandas as pd
import polars as pl
import itertools
from typing import Union, Optional
import tqdm


##############################################################################
# 1.  Import helpers (defined further down in this module)                   #
##############################################################################
# - import_data_pd(file)        → pandas  (one row per sample, index="sample")
# - import_data_pl(file)        → polars  (same layout, "sample" is a column)
##############################################################################


[docs]
def process_batch_files(
    files: Iterable[Union[str, Path]],
    skiprows: int = 15,
    separator: str = ',',
    engine: Literal["pd", "pl"] = "pl",
    concat_how: Literal[
        "vertical", "vertical_relaxed", "diagonal", "diagonal_relaxed"
    ] = "vertical",
    keep_index: bool = True,
    index_col: Optional[str] = None,
    show_progress: bool = True,
) -> pl.DataFrame:
    """
    Import a batch of FT-IR CSVs and concatenate them into one Polars frame.

    Parameters
    ----------
    files : iterable of str | Path
        Paths to the spectral CSV files.
    skiprows : int, default 15
        Number of rows to skip at the start of each file (e.g. metadata).
    separator : str, default ','
        Delimiter for the CSV files.
    engine : {'pd', 'pl'}, default 'pl'
        • 'pd'  → read via ``import_data_pd``, then convert to Polars.
        • 'pl'  → read directly via ``import_data_pl``.
    concat_how : str, default 'vertical'
        Forwarded unchanged to ``pl.concat(how=...)``.
        • 'vertical'          → schemas must match exactly.
        • 'vertical_relaxed'  → like 'vertical', but mismatched dtypes are
                                 coerced to a common supertype.
        • 'diagonal'          → union of the columns by name; values missing
                                 from a file are filled with nulls.
    keep_index : bool, default True
        Only used when *engine* is 'pd' and *index_col* is None: whether the
        pandas index (named "sample" by ``import_data_pd``) is carried over as
        a column when converting to Polars.
    index_col : str, optional
        Column name handed to the importer as the row identifier.  With the
        'pd' engine the resulting pandas index is always included in the
        Polars frame, regardless of *keep_index*.
    show_progress : bool, default True
        Toggle the tqdm progress bar.

    Returns
    -------
    pl.DataFrame
        All spectra stacked row-wise (each row = one sample).

    Raises
    ------
    ValueError
        If *engine* is not 'pd' or 'pl', if *files* yields no paths, or if
        Polars reports a missing column while concatenating.
    """
    # Validate before touching any file so a bad engine is reported even when
    # the file list is empty (previously that case reported "No DataFrames").
    if engine not in ("pd", "pl"):
        raise ValueError("engine must be 'pd' or 'pl'")

    # An explicit index_col lives in the pandas index after import, so it must
    # always be carried over; otherwise the caller's keep_index decides.
    include_index = True if index_col is not None else keep_index

    dfs: list[pl.DataFrame] = []
    for file in tqdm.tqdm(files, desc="Importing", disable=not show_progress):
        file = Path(file)                       # normalise
        if engine == "pd":
            df_pd: pd.DataFrame = import_data_pd(file, skiprows, separator, index_col)
            df_pl = pl.from_pandas(df_pd, include_index=include_index)
        else:
            df_pl = import_data_pl(file, skiprows, separator, index_col)
        dfs.append(df_pl)

    if not dfs:
        raise ValueError("No DataFrames were created from the provided file list.")

    # NOTE(review): depending on the Polars version, schema mismatches may
    # surface as SchemaError/ShapeError instead; those propagate unchanged so
    # callers that already catch them keep working.
    try:
        return pl.concat(dfs, how=concat_how, rechunk=True)
    except pl.exceptions.ColumnNotFoundError as e:
        raise ValueError(
            "Column mis-match across files. "
            "Try concat_how='diagonal' or inspect individual schemas."
        ) from e



# import data


[docs]
def import_data(
    file_path: Union[str, Path],
    engine: str = 'pl',
    skiprows: int = 15,
    separator: str = ',',
    index_col: Optional[str] = None
) -> Union[pd.DataFrame, pl.DataFrame]:
    """
    Load a single-sample spectral CSV with the chosen backend.

    Thin dispatcher around ``import_data_pd`` and ``import_data_pl``; all
    reshaping and labelling is done by those functions.

    Parameters
    ----------
    file_path : str or pathlib.Path
        Path to the CSV file to import.
    engine : str, default 'pl'
        'pd' for pandas or 'pl' for polars.
    skiprows : int, default 15
        Number of rows to skip at the start of the file (e.g. metadata).
    separator : str, default ','
        Delimiter for the CSV file.
    index_col : str, optional
        Column name to use as the row identifier; forwarded unchanged to the
        selected importer.

    Returns
    -------
    pd.DataFrame | pl.DataFrame
        Whatever the selected importer returns: a pandas frame for 'pd',
        a Polars frame for 'pl'.

    Raises
    ------
    ValueError
        If *engine* is neither 'pd' nor 'pl'.
    """
    if engine == 'pd':
        return import_data_pd(file_path, skiprows, separator, index_col)
    if engine == 'pl':
        return import_data_pl(file_path, skiprows, separator, index_col)
    # Reject unknown engines instead of silently falling back to Polars,
    # matching the check performed by process_batch_files.
    raise ValueError("engine must be 'pd' or 'pl'")


# Import data using Pandas


[docs]
def import_data_pd(
    file_path: Union[str, Path],
    skiprows: int = 15,
    sep: str = ',',
    index_col: Optional[str] = None
) -> pd.DataFrame:
    """
    Load a single-sample spectral CSV with pandas and return it as one row.

    The file is read without a header, its first column becomes the
    wavenumber axis, and the frame is transposed so that the sample is a row
    and each wavenumber a column.

    Parameters
    ----------
    file_path : str or pathlib.Path
        Path to the CSV file to import.  The file stem is used as the sample
        name.
    skiprows : int, default 15
        Number of rows to skip at the start of the file (e.g. metadata).
    sep : str, default ','
        Delimiter for the CSV file.
    index_col : str, optional
        Name of the row index of the result.
        • None or "sample" → the sample name stays in the index ("sample").
        • any other value  → must be an existing column (e.g. "label"); it
          replaces the sample-name index, which is then no longer present.

    Returns
    -------
    pd.DataFrame
        One-row frame with:
        - Index name = "sample", or *index_col* when another column was chosen
        - A "label" column (first position) holding the part of the file stem
          that precedes its first digit
        - One column per wavenumber value

    Raises
    ------
    ValueError
        If *index_col* is neither "sample" nor an existing column.
    """
    file_path = Path(file_path)
    sample_name = file_path.stem

    # Read the two-column file; the first column is the wavenumber axis.
    df = pd.read_csv(
        file_path,
        skiprows=skiprows,
        index_col=0,
        header=None,
        sep=sep
    )
    df.index.name = "wavenumber"
    df.columns = [sample_name]

    # Transpose so the sample becomes the single row.
    df_transposed = df.T
    df_transposed.index.name = "sample"

    # Label = leading characters of the sample name up to the first digit.
    sample_label = ''.join(
        itertools.takewhile(lambda ch: not ch.isdigit(), sample_name)
    )
    # Insert directly at position 0 (same layout as the polars importer);
    # this avoids re-selecting every column by label.
    df_transposed.insert(0, "label", sample_label)

    # "sample" already is the index after the transpose, so asking for it is
    # a no-op rather than an error (it is never a regular column here).
    if index_col is not None and index_col != df_transposed.index.name:
        if index_col not in df_transposed.columns:
            raise ValueError(
                f"Column '{index_col}' not found in DataFrame. "
                f"Available columns: {df_transposed.columns.tolist()}"
            )
        df_transposed = df_transposed.set_index(index_col)

    return df_transposed



# Import data using Polars



[docs]
def import_data_pl(
    file_path: Union[str, Path],
    skiprows: int = 15,
    sep: str = ",",
    index_col: Optional[str] = None
) -> pl.DataFrame:
    """
    Read one spectral CSV with Polars and return it as a single wide row.

    The two-column file (wavenumber, intensity) is pivoted so that every
    wavenumber becomes its own column; a "sample" column (file stem) and a
    "label" column (file stem up to its first digit) are attached.

    Parameters
    ----------
    file_path : str or pathlib.Path
        Path to the CSV file.  Its stem is used as the sample name.
    skiprows : int, default 15
        Number of lines to skip before the data starts (e.g. metadata).
    sep : str, default ","
        Field delimiter for the CSV.
    index_col : str, optional
        Column to place first in the result, e.g. 'sample' or 'label'.
        If None, the order is "sample", "label", then the spectral columns.

    Returns
    -------
    pl.DataFrame
        One row for the sample.  With *index_col* given, the order is:
        that column, the remaining columns whose names do not parse as
        numbers, then the columns whose names do (the wavenumbers).

    Raises
    ------
    ValueError
        If *index_col* is given but is not a column of the pivoted frame.
    """
    csv_path = Path(file_path)
    sample_name = csv_path.stem

    # Long format: one row per wavenumber, tagged with the sample name so the
    # pivot below has an index column to group on.
    long_df = pl.read_csv(
        source=str(csv_path),
        skip_rows=skiprows,
        has_header=False,
        separator=sep,
        new_columns=["wavenumber", sample_name]
    ).with_columns([
        pl.lit(sample_name).alias("sample")
    ])

    # Wide format: a single row, one column per wavenumber.
    wide_df = long_df.pivot(
        values=sample_name,
        index="sample",
        columns="wavenumber",
    )

    # Label = leading characters of the sample name up to the first digit.
    label_chars = itertools.takewhile(lambda ch: not ch.isdigit(), sample_name)
    wide_df = wide_df.with_columns([
        pl.lit("".join(label_chars)).alias("label")
    ])

    cols = wide_df.columns

    # Default layout: identifiers first, spectrum afterwards.
    if index_col is None:
        spectrum = [name for name in cols if name not in ("sample", "label")]
        return wide_df.select(["sample", "label"] + spectrum)

    if index_col not in cols:
        raise ValueError(
            f"Column '{index_col}' not found in DataFrame. "
            f"Available columns: {cols}"
        )

    def _parses_as_number(name) -> bool:
        # Spectral columns are recognised by a name that float() accepts.
        try:
            float(name)
        except (ValueError, TypeError):
            return False
        return True

    remaining = [name for name in cols if name != index_col]
    metadata_cols = [name for name in remaining if not _parses_as_number(name)]
    spectral_cols = [name for name in remaining if _parses_as_number(name)]

    # Order: index_col, then other metadata, then spectral columns.
    return wide_df.select([index_col] + metadata_cols + spectral_cols)