Source code for towbintools.foundation.file_handling

import os
import re

import numpy as np
import polars as pl


def extract_time_point(
    path: str,
    time_regex: str = r"Time(\d+)",
    point_regex: str = r"Point(\d+)",
) -> tuple[int, int]:
    r"""
    Parse the time and point indices embedded in a file path.

    Both patterns are searched anywhere in ``path``; the first capture group
    of each match is converted to an integer.

    Parameters:
        path (str): The file path or name to extract information from.
        time_regex (str): Regular expression whose first group captures the
            time index. (default: r"Time(\d+)")
        point_regex (str): Regular expression whose first group captures the
            point index. (default: r"Point(\d+)")

    Returns:
        tuple: ``(time, point)`` as integers.

    Raises:
        ValueError: If either pattern does not match ``path``.
    """
    found_time = re.compile(time_regex).search(path)
    found_point = re.compile(point_regex).search(path)

    # Guard clause: both pieces of information are required.
    if found_time is None or found_point is None:
        raise ValueError("Could not extract time and point from file name.")

    return int(found_time.group(1)), int(found_point.group(1))
def get_all_timepoints_from_dir(
    dir_path: str,
    time_regex: str = r"Time(\d+)",
    point_regex: str = r"Point(\d+)",
) -> list[dict]:
    r"""
    Retrieve all time points and corresponding image paths from a directory.

    Only direct children of ``dir_path`` that are not directories are
    considered. The patterns are matched against each entry's *file name*
    (not the full path), so a parent directory whose name happens to contain
    e.g. ``Time12`` or ``Point3`` cannot influence the extracted values.
    Entries are visited in sorted name order so the result is deterministic.

    Parameters:
        dir_path (str): The path to the directory containing the images.
        time_regex (str): Regular expression pattern to extract time
            information from file names. (default: r"Time(\d+)")
        point_regex (str): Regular expression pattern to extract point
            information from file names. (default: r"Point(\d+)")

    Returns:
        list: A list of dictionaries with keys ``"Time"`` (int), ``"Point"``
        (int) and ``"ImagePath"`` (``dir_path`` joined with the file name).
        Files for which either pattern does not match are left out.
    """
    time_pattern = re.compile(time_regex)
    point_pattern = re.compile(point_regex)

    timepoint_list = []
    for file_name in sorted(os.listdir(dir_path)):
        image_path = os.path.join(dir_path, file_name)

        # Subdirectories are not images; skip them.
        if os.path.isdir(image_path):
            continue

        # Match on the file name only, never on the directory components.
        time_match = time_pattern.search(file_name)
        point_match = point_pattern.search(file_name)

        # Only add to list if both time and point are found
        if time_match and point_match:
            timepoint_list.append(
                {
                    "Time": int(time_match.group(1)),
                    "Point": int(point_match.group(1)),
                    "ImagePath": image_path,
                }
            )

    return timepoint_list
def fill_empty_timepoints(
    filemap: pl.DataFrame,
) -> pl.DataFrame:
    """
    Fill in missing time points in a filemap dataframe with empty image paths.

    For every point, any time that occurs somewhere in ``filemap`` but not
    for that point is added as a row whose ``"ImagePath"`` is ``""``. The
    result is sorted by ``"Point"`` and then ``"Time"``.

    Parameters:
        filemap (pl.DataFrame): The filemap dataframe containing 'Time',
            'Point', and 'ImagePath' columns.

    Returns:
        pl.DataFrame: The filled filemap dataframe with missing time points
        included. A filemap without rows is returned unchanged.
    """
    # Nothing to fill; this also covers a column-less frame such as
    # ``pl.DataFrame([])`` for which the column lookups below would fail.
    if filemap.height == 0:
        return filemap

    # ``squeeze`` collapses a single value to a 0-d array, which cannot be
    # iterated; ``np.atleast_1d`` restores a 1-d array in that case.
    all_points = np.atleast_1d(
        filemap.select(pl.col("Point")).unique(maintain_order=True).to_numpy().squeeze()
    )
    all_times = np.atleast_1d(
        filemap.select(pl.col("Time")).unique(maintain_order=True).to_numpy().squeeze()
    )
    all_times_set = set(all_times)  # loop-invariant, built once

    missing_times = []
    for point in all_points:
        # Times present for the current point. A point with exactly one
        # time also squeezes to 0-d, hence the same ``atleast_1d`` guard.
        times_of_point = np.atleast_1d(
            filemap.filter(pl.col("Point") == point)
            .select(pl.col("Time"))
            .to_numpy()
            .squeeze()
        )
        missing = all_times_set - set(times_of_point)
        missing_times.extend(
            [{"Time": time, "Point": point, "ImagePath": ""} for time in missing]
        )

    if missing_times:
        filemap_extended = pl.DataFrame(missing_times)
        filled_filemap = pl.concat([filemap, filemap_extended]).sort(["Point", "Time"])
    else:
        filled_filemap = filemap.sort(["Point", "Time"])

    return filled_filemap
def get_dir_filemap(
    dir_path: str,
    time_regex: str = r"Time(\d+)",
    point_regex: str = r"Point(\d+)",
) -> pl.DataFrame:
    r"""
    Build the filemap dataframe of a single directory.

    The directory is scanned with ``get_all_timepoints_from_dir`` and the
    resulting records are passed through ``fill_empty_timepoints``.

    Parameters:
        dir_path (str): The path to the directory containing the images.
        time_regex (str): Regular expression pattern to extract time
            information from file names. (default: r"Time(\d+)")
        point_regex (str): Regular expression pattern to extract point
            information from file names. (default: r"Point(\d+)")

    Returns:
        pl.DataFrame: The filemap dataframe with 'Time', 'Point', and
        'ImagePath' columns.
    """
    records = get_all_timepoints_from_dir(dir_path, time_regex, point_regex)
    return fill_empty_timepoints(pl.DataFrame(records))
def get_experiment_dir_filemap(
    dir_path: str,
    raw_dir: str = "raw",
    analysis_dir: str = "analysis",
    time_regex: str = r"Time(\d+)",
    point_regex: str = r"Point(\d+)",
) -> pl.DataFrame:
    r"""
    Get the filemap dataframe for an experiment directory.

    Retrieves time points from the raw directory and fills in missing ones,
    then left-joins one column of image paths for every subdirectory found
    (recursively) under the analysis directory.

    Parameters:
        dir_path (str): Base directory path for the experiment.
        raw_dir (str): Subdirectory name for raw images; also used as the
            name of the raw image path column. (default: "raw")
        analysis_dir (str): Subdirectory name for analysis output.
            (default: "analysis")
        time_regex (str): Regular expression pattern to extract time
            information from file names. (default: r"Time(\d+)")
        point_regex (str): Regular expression pattern to extract point
            information from file names. (default: r"Point(\d+)")

    Returns:
        pl.DataFrame: Extended filemap dataframe including both raw and
        analysis image paths. Each analysis column is named by joining the
        full analysis directory path with the subdirectory's base name.
        When the analysis directory exists, null entries are replaced by
        empty strings.
    """
    raw_timepoint_list = get_all_timepoints_from_dir(
        os.path.join(dir_path, raw_dir), time_regex, point_regex
    )
    experiment_filemap = fill_empty_timepoints(pl.DataFrame(raw_timepoint_list))
    # ``rename`` returns a new frame rather than modifying in place, so the
    # result has to be assigned for the raw column to actually be renamed.
    experiment_filemap = experiment_filemap.rename({"ImagePath": raw_dir})

    analysis_path = os.path.join(dir_path, analysis_dir)
    if not os.path.exists(analysis_path):
        return experiment_filemap

    for subdir, _dirnames, _filenames in os.walk(analysis_path):
        # ``os.walk`` yields the root itself first; only subdirectories count.
        if subdir == analysis_path:
            continue

        timepoint_list = get_all_timepoints_from_dir(subdir, time_regex, point_regex)
        # A subdirectory without matching images would give a frame without
        # 'Time'/'Point' columns, which cannot be filled or joined.
        if not timepoint_list:
            continue

        filemap = fill_empty_timepoints(pl.DataFrame(timepoint_list))
        filemap = filemap.rename(
            {"ImagePath": os.path.join(analysis_path, os.path.basename(subdir))},
        )
        experiment_filemap = experiment_filemap.join(
            filemap, on=["Time", "Point"], how="left"
        )

    # Rows without a match in a left join hold nulls; polars spells the
    # replacement ``fill_null`` (``fillna`` is the pandas name).
    return experiment_filemap.fill_null("")
def add_dir_to_experiment_filemap(
    experiment_filemap: pl.DataFrame,
    dir_path: str,
    subdir_name: str,
    time_regex: str = r"Time(\d+)",
    point_regex: str = r"Point(\d+)",
) -> pl.DataFrame:
    r"""
    Attach the images of a directory to an existing filemap as a new column.

    The directory's filemap is left-joined onto ``experiment_filemap`` on
    ``"Time"`` and ``"Point"``. A pre-existing column called ``subdir_name``
    is dropped first, so it gets replaced.

    Parameters:
        experiment_filemap (pl.DataFrame): Existing filemap dataframe with
            at least ``"Time"`` and ``"Point"`` columns.
        dir_path (str): The path to the directory containing the images.
        subdir_name (str): The name of the new column to be added to the
            filemap.
        time_regex (str): Regular expression pattern to extract time
            information from file names. (default: r"Time(\d+)")
        point_regex (str): Regular expression pattern to extract point
            information from file names. (default: r"Point(\d+)")

    Returns:
        pl.DataFrame: Updated filemap dataframe with the new column added,
        missing entries filled with empty strings.
    """
    new_column = get_dir_filemap(dir_path, time_regex, point_regex).rename(
        {"ImagePath": subdir_name}
    )

    # Replace rather than duplicate a column of the same name.
    base = experiment_filemap
    if subdir_name in base.columns:
        base = base.drop(subdir_name)

    merged = base.join(new_column, on=["Time", "Point"], how="left")
    return merged.fill_nan("").fill_null("")
def read_filemap(filemap_path: str, lazy_loading: bool = False) -> pl.DataFrame:
    """
    Load a filemap stored as CSV or Parquet with Polars.

    A path ending in ``.parquet`` is read as Parquet; every other path is
    read as CSV. With ``lazy_loading`` set, the ``scan_*`` readers are used
    and a ``LazyFrame`` is returned instead of an eager ``DataFrame``.

    Parameters:
        filemap_path (str): Path to the filemap file (``.csv`` or
            ``.parquet``).
        lazy_loading (bool, optional): If ``True``, return a Polars
            ``LazyFrame`` instead of a ``DataFrame``. (default: False)

    Returns:
        pl.DataFrame: The filemap as a Polars DataFrame (or LazyFrame when
        ``lazy_loading`` is ``True``).
    """
    if filemap_path.endswith(".parquet"):
        parquet_reader = pl.scan_parquet if lazy_loading else pl.read_parquet
        return parquet_reader(filemap_path)

    # Options shared by the eager and lazy CSV readers.
    csv_options = {
        "infer_schema_length": 10000,
        "null_values": ["np.nan", "[nan]", "", "NaN", "nan", "NA", "N/A"],
    }
    csv_reader = pl.scan_csv if lazy_loading else pl.read_csv
    return csv_reader(filemap_path, **csv_options)
def write_filemap(filemap: pl.DataFrame, filemap_path: str) -> None:
    """
    Save a filemap to disk as CSV or Parquet with Polars.

    A destination ending in ``.parquet`` is written as Parquet; any other
    destination is written as CSV.

    Parameters:
        filemap (pl.DataFrame): The filemap dataframe to write.
        filemap_path (str): Destination path for the filemap file (``.csv``
            or ``.parquet``).

    Returns:
        None
    """
    wants_parquet = filemap_path.endswith(".parquet")
    if not wants_parquet:
        filemap.write_csv(filemap_path)
        return

    filemap.write_parquet(filemap_path)