Source code for exm.args.args

"""
Sets up the project parameters with enhanced configurability.
"""
import os
import json
import glob
import pathlib
import yaml
from pathlib import Path
from nd2reader import ND2Reader
from typing import List, Optional, Dict, Any, Union
from exm.utils.log import configure_logger
logger = configure_logger('ExSeq-Toolbox')


class Args:
    r"""
    A class used to represent and manage the configuration for ExSeq-Toolbox.

    Attributes are set using the `set_params` method, and the configuration can be
    saved to or loaded from a JSON file (or a sectioned YAML file). Directory
    structures for the project can also be created, and file permissions can be
    modified as needed.

    **Methods:**

    - `set_params`: Sets various parameters for the ExSeq-Toolbox.
    - `save_params`: Saves the current parameters to a JSON file.
    - `load_params`: Loads parameters from a JSON file.
    - `create_directroy_structure`: Creates the necessary directory structure for the project.
    - `set_permissions`: Sets file permissions for the project directory.
    - `print`: Prints all current parameters.
    - `save_config_yaml` / `load_config_yaml`: Sectioned YAML export / import.
    - `get_memory_config` / `get_processing_recommendations`: Small summary dictionaries.
    """

    # YAML sections whose keys map one-to-one onto attribute names. These are the
    # keys `save_config_yaml` writes, so `load_config_yaml` accepts them even on an
    # instance on which `set_params` has not been called yet.
    _YAML_DIRECT_SECTIONS = {
        'data_paths': ('raw_data_path', 'processed_data_path',
                       'puncta_dir_name', 'gene_digit_csv'),
        'experiment': ('codes', 'fovs', 'spacing', 'channel_names',
                       'ref_code', 'ref_channel'),
        'processing': ('chunk_size', 'parallel_processes', 'use_gpu_processing',
                       'gpu_memory_fraction', 'auto_cleanup_memory'),
        'system': ('permission', 'permission_mode'),
    }

    def __init__(self):
        # Enhanced processing parameters
        self.chunk_size = 100
        self.gpu_memory_fraction = 0.8
        self.background_subtraction_radius = 50
        self.auto_cleanup_memory = True
        self.parallel_processes = self._auto_detect_parallel_processes()  # Auto-detect
        self.use_gpu_processing = self._auto_detect_gpu()

        # Alignment parameters
        self.alignment_downsample_factors = (2, 4, 4)
        self.alignment_low_percentile = 1.0
        self.alignment_high_percentile = 99.0

        # Puncta extraction parameters
        self.puncta_min_distance = 7
        self.puncta_gaussian_sigma = 1.0
        self.puncta_exclude_border = False
        self.consolidation_distance_threshold = 8.0

        # Basecalling parameters
        self.hamming_distance_threshold = 2

        # System parameters
        # NOTE(review): `permission_mode` is stored and exported, but `set_permissions`
        # in this class does not pass it to `chmod` - confirm whether that is intended.
        self.permission_mode = 0o777
        self.temp_directory = None

    def __str__(self):
        r"""Returns a string representation of the Args object (its attribute dictionary)."""
        return str(self.__dict__)

    def __repr__(self):
        r"""Returns the same text as ``__str__`` (the attribute dictionary).

        The result is *not* an expression that rebuilds the object with ``eval()``.
        """
        return self.__str__()

    def set_params(self,
                   raw_data_path: str,
                   processed_data_path: Optional[str] = None,
                   puncta_dir_name: Optional[str] = 'puncta/',
                   codes: List[int] = list(range(7)),
                   fovs: Optional[List[int]] = None,
                   spacing: List[float] = [0.4, 1.625, 1.625],
                   channel_names: List[str] = ['640', '594', '561', '488', '405'],
                   ref_code: int = 0,
                   ref_channel: str = '405',
                   gene_digit_csv: str = './gene_list.csv',
                   permission: Optional[bool] = False,
                   create_directroy_structure: Optional[bool] = True,
                   args_file_name: Optional[str] = 'exseq_toolbox_args',
                   # Enhanced parameters
                   chunk_size: Optional[int] = None,
                   gpu_memory_fraction: Optional[float] = None,
                   parallel_processes: Optional[int] = None,
                   use_gpu_processing: Optional[bool] = None,
                   puncta_thresholds: Optional[List[int]] = None,
                   auto_cleanup_memory: Optional[bool] = None,
                   **kwargs) -> None:
        r"""
        Sets parameters for running ExSeq ToolBox.

        :param raw_data_path: The absolute path to the project's raw data directory (.nd2 files). There is no default value, this must be provided.
        :type raw_data_path: str
        :param processed_data_path: The absolute path to the processed data directory. Default is a 'processed_data' subdirectory inside the raw_data_path.
        :type processed_data_path: Optional[str]
        :param puncta_dir_name: The directory name to store the puncta analysis in the processed data directory. Default is a 'puncta' subdirectory inside the processed_data_path.
        :type puncta_dir_name: Optional[str]
        :param codes: A list of integers, each representing a specific code. Default: integers 0-6.
        :type codes: List[int]
        :param fovs: A list of integers, each representing a specific field of view. When omitted, a previously set ``fovs`` is kept; otherwise the count is detected from the ``code0/raw_fov*.h5`` files in raw_data_path. Default: ``None``.
        :type fovs: Optional[List[int]]
        :param spacing: Spacing between pixels in the format [Z,Y,X]. Default: [0.4, 1.625, 1.625].
        :type spacing: List[float]
        :param channel_names: Names of channels in the ND2 file *in the correct sequence*. Default is ['640','594','561','488','405'].
        :type channel_names: List[str]
        :param ref_code: Specifies which code to use as the reference round. Default: 0.
        :type ref_code: int
        :param ref_channel: Specifies which channel to use as the reference for alignment. Default is '405'.
        :type ref_channel: str
        :param gene_digit_csv: absolute path of the CSV file containing gene list. Default: './gene_list.csv'.
        :type gene_digit_csv: str
        :param permission: If set to ``True``, changes permission of the processed_data_path to allow other users to read and write on the generated files. Default is ``False``. `Only for Linux and MacOS users`
        :type permission: Optional[bool]
        :param create_directroy_structure: If set to ``True``, creates the directory structure in the specified processed_data_path. Default: ``True``.
        :type create_directroy_structure: Optional[bool]
        :param args_file_name: The name of the JSON file to store the project arguments. Default: 'exseq_toolbox_args'.
        :type args_file_name: Optional[str]
        :param chunk_size: Overrides ``chunk_size`` when not ``None``.
        :type chunk_size: Optional[int]
        :param gpu_memory_fraction: Overrides ``gpu_memory_fraction`` when not ``None``.
        :type gpu_memory_fraction: Optional[float]
        :param parallel_processes: Overrides the auto-detected ``parallel_processes`` when not ``None``.
        :type parallel_processes: Optional[int]
        :param use_gpu_processing: Overrides the auto-detected ``use_gpu_processing`` when not ``None``.
        :type use_gpu_processing: Optional[bool]
        :param puncta_thresholds: Overrides the default ``thresholds`` ([200, 300, 300, 200]) when not ``None``.
        :type puncta_thresholds: Optional[List[int]]
        :param auto_cleanup_memory: Overrides ``auto_cleanup_memory`` when not ``None``.
        :type auto_cleanup_memory: Optional[bool]
        :param kwargs: Extra ``name=value`` pairs; applied only to attributes that already exist on the instance, others are ignored with a warning.
        """
        # NOTE(review): the spacing default in code is 0.4 for Z while the previous
        # docstring advertised 4.0 - confirm which value is intended.
        self.raw_data_path = os.path.abspath(raw_data_path)
        self.puncta_dir_name = puncta_dir_name
        # Copy list arguments: the signature uses mutable defaults, and storing them
        # by reference would let `args.codes.append(...)` change the default itself.
        self.codes = list(codes)
        self.channel_names = list(channel_names)
        self.spacing = list(spacing)
        self.permission = permission
        self.ref_code = ref_code
        self.ref_channel = ref_channel
        self.gene_digit_csv = gene_digit_csv

        # Housekeeping
        self.code2num = {'a': '0', 'c': '1', 'g': '2', 't': '3'}
        self.colors = ['red', 'yellow', 'green', 'blue']
        self.colorscales = ['Reds', 'Oranges', 'Greens', 'Blues']
        self.thresholds = [200, 300, 300, 200]

        if processed_data_path is not None:
            self.processed_data_path = os.path.abspath(processed_data_path)
        else:
            self.processed_data_path = os.path.join(
                self.raw_data_path, "processed_data")
        self._update_derived_paths()

        if fovs:
            self.fovs = list(fovs)
        elif not getattr(self, "fovs", None):
            # Nothing supplied and nothing usable stored: count the raw files of code 0.
            fovs_num = len(glob.glob(self.data_path.format(0, "*")))
            self.fovs = list(range(fovs_num))
        # Otherwise keep the field-of-view list that is already set on the instance.

        if create_directroy_structure:
            self.create_directroy_structure()

        if permission:
            self.set_permissions()

        # Set enhanced parameters (None means "leave the current value alone").
        overrides = {
            'chunk_size': chunk_size,
            'gpu_memory_fraction': gpu_memory_fraction,
            'parallel_processes': parallel_processes,
            'use_gpu_processing': use_gpu_processing,
            'thresholds': puncta_thresholds,
            'auto_cleanup_memory': auto_cleanup_memory,
        }
        for name, value in overrides.items():
            if value is not None:
                setattr(self, name, value)

        # Apply any additional keyword arguments. Only instance attributes qualify,
        # so a stray keyword can never replace a method such as `print`.
        for key, value in kwargs.items():
            if key in self.__dict__:
                setattr(self, key, value)
                logger.debug(f"Set parameter {key} = {value}")
            else:
                logger.warning(f"Ignoring unknown parameter '{key}'")

        self.save_params(args_file_name)

    def _update_derived_paths(self) -> None:
        """Rebuild the path templates derived from the raw / processed data paths."""
        self.data_path = os.path.join(
            self.raw_data_path, "code{}/raw_fov{}.h5")
        self.h5_path = os.path.join(self.processed_data_path, "code{}/{}.h5")
        self.tform_path = os.path.join(
            self.processed_data_path, "code{}/tforms/{}")
        self.puncta_path = os.path.join(
            self.processed_data_path, self.puncta_dir_name)

    def _restore_tuple_attributes(self) -> None:
        """Turn attributes that JSON/YAML decoded as lists back into tuples."""
        factors = self.__dict__.get('alignment_downsample_factors')
        if isinstance(factors, list):
            self.alignment_downsample_factors = tuple(factors)

    def save_params(self, args_file_name):
        r"""Saves the parameters to a .json file.

        The file is written to ``<processed_data_path>/<args_file_name>.json``; the
        processed data directory is created if it does not exist yet.

        :param args_file_name: Name of the parameters file.
        :type args_file_name: str
        """
        try:
            # Serialize before opening the file so that a non-serializable attribute
            # cannot leave a truncated JSON file behind.
            payload = json.dumps(self.__dict__)
            os.makedirs(self.processed_data_path, exist_ok=True)
            target = os.path.join(self.processed_data_path,
                                  args_file_name + '.json')
            with open(target, "w") as f:
                f.write(payload)
        except Exception as e:
            logger.error(f"Failed to save configuration. Error: {e}")
            raise

    def load_params(self, param_path: str) -> None:
        r"""Loads and sets the configuration parameters from a previously saved .json file.

        :param param_path: The path to the '.json' file containing the serialized parameters. This is typically the 'exseq_toolbox_args.json' file generated by the `set_params` call, located within the processed data directory.
        :type param_path: str
        """
        try:
            param_path = os.path.abspath(param_path)
            with open(param_path, "r") as f:
                loaded = json.load(f)
            if not isinstance(loaded, dict):
                raise ValueError(
                    f"Expected a JSON object in {param_path}, got {type(loaded).__name__}")
            self.__dict__.update(loaded)
            # JSON has no tuple type; restore what `__init__` defines as a tuple.
            self._restore_tuple_attributes()
        except Exception as e:
            logger.error(f"Failed to load parameters. Error: {e}")
            raise

    def create_directroy_structure(self):
        r"""Creates the directory structure in the specified project path."""
        from exm.io import create_folder_structure
        try:
            create_folder_structure(str(self.processed_data_path), str(
                self.puncta_dir_name), self.fovs, self.codes)
        except Exception as e:
            logger.error(f"Failed to create directory structure. Error: {e}")
            raise

    def set_permissions(self):
        r"""Changes permission of the processed_data_path to allow other users to read and write on the generated files."""
        try:
            from exm.utils.utils import chmod
            chmod(pathlib.Path(self.processed_data_path))
        except Exception as e:
            logger.error(f"Failed to set permissions. Error: {e}")
            raise

    def print(self) -> None:
        r"""Prints all attributes of the Args object."""
        try:
            for key, value in self.__dict__.items():
                print(f"{key}: {value}")
        except Exception as e:
            logger.error(f"Failed to print parameters. Error: {e}")
            raise

    def _auto_detect_parallel_processes(self) -> int:
        """Auto-detect optimal number of parallel processes.

        Returns ``min(cpu_count, total_RAM_GB // 4 (at least 1), 8)``, or 1 when the
        detection cannot run.
        """
        try:
            import multiprocessing
            import psutil

            cpu_count = multiprocessing.cpu_count()
            memory_gb = psutil.virtual_memory().total / (1024**3)

            # Conservative estimate: 1 process per 4GB RAM, max 8 processes
            max_by_memory = max(1, int(memory_gb / 4))
            optimal = min(cpu_count, max_by_memory, 8)

            logger.info(f"Auto-detected {optimal} parallel processes (CPU: {cpu_count}, Memory: {memory_gb:.1f}GB)")
            return optimal
        except ImportError:
            logger.warning("Could not auto-detect parallel processes, using 1")
            return 1
        except Exception as e:
            # This runs inside __init__; a probing failure must not make the
            # configuration object unconstructable.
            logger.warning(f"Parallel process auto-detection failed ({e}), using 1")
            return 1

    def _auto_detect_gpu(self) -> bool:
        """Auto-detect GPU availability via CuPy's device count."""
        try:
            import cupy
            gpu_count = cupy.cuda.runtime.getDeviceCount()
        except ImportError:
            logger.info("GPU not available (CuPy not installed)")
            return False
        except Exception as e:
            logger.warning(f"GPU detection failed: {e}")
            return False

        if gpu_count < 1:
            logger.info("GPU not available (no CUDA devices found)")
            return False
        logger.info(f"GPU detected: {gpu_count} device(s) available")
        return True

    def get_memory_config(self):
        """Get memory configuration dictionary."""
        return {
            'chunk_size': self.chunk_size,
            'gpu_memory_fraction': self.gpu_memory_fraction,
            'auto_cleanup': self.auto_cleanup_memory
        }

    def save_config_yaml(self, filename: str) -> None:
        """Save configuration in YAML format for better readability.

        Requires the attributes assigned by `set_params` (paths, codes, fovs, ...);
        an ``AttributeError`` is raised if they are missing.
        """
        config = {
            'data_paths': {
                'raw_data_path': self.raw_data_path,
                'processed_data_path': self.processed_data_path,
                'puncta_dir_name': self.puncta_dir_name,
                'gene_digit_csv': self.gene_digit_csv,
            },
            'experiment': {
                'codes': self.codes,
                'fovs': self.fovs,
                'spacing': self.spacing,
                'channel_names': self.channel_names,
                'ref_code': self.ref_code,
                'ref_channel': self.ref_channel,
            },
            'processing': {
                'chunk_size': self.chunk_size,
                'parallel_processes': self.parallel_processes,
                'use_gpu_processing': self.use_gpu_processing,
                'gpu_memory_fraction': self.gpu_memory_fraction,
                'auto_cleanup_memory': self.auto_cleanup_memory,
            },
            'alignment': {
                'downsample_factors': list(self.alignment_downsample_factors),
                'low_percentile': self.alignment_low_percentile,
                'high_percentile': self.alignment_high_percentile,
            },
            'puncta': {
                'thresholds': self.thresholds,
                'min_distance': self.puncta_min_distance,
                'gaussian_sigma': self.puncta_gaussian_sigma,
                'exclude_border': self.puncta_exclude_border,
                'consolidation_distance_threshold': self.consolidation_distance_threshold,
            },
            'system': {
                'permission': self.permission,
                'permission_mode': self.permission_mode,
            }
        }

        try:
            with open(filename, 'w') as f:
                yaml.dump(config, f, default_flow_style=False, indent=2)
            logger.info(f"Configuration saved to {filename}")
        except Exception as e:
            logger.error(f"Failed to save YAML configuration: {e}")
            raise

    def load_config_yaml(self, filename: str) -> None:
        """Load configuration from YAML file.

        Accepts the layout written by `save_config_yaml`. Keys that
        `save_config_yaml` writes are always applied; any other key is applied only
        if an instance attribute of that name already exists.
        """
        try:
            with open(filename, 'r') as f:
                # An empty file parses to None; treat it as "no settings".
                config = yaml.safe_load(f) or {}
            if not isinstance(config, dict):
                raise ValueError(
                    f"Expected a mapping at the top level of {filename}")

            # Sections whose keys are attribute names.
            for section, known_keys in self._YAML_DIRECT_SECTIONS.items():
                for key, value in (config.get(section) or {}).items():
                    if key in known_keys or key in self.__dict__:
                        setattr(self, key, value)

            # 'alignment' keys are stored with an 'alignment_' prefix.
            for key, value in (config.get('alignment') or {}).items():
                attr_name = f'alignment_{key}'
                if hasattr(self, attr_name):
                    setattr(self, attr_name, value)
            self._restore_tuple_attributes()

            # 'puncta' keys are stored with a 'puncta_' prefix, except the two that
            # `save_config_yaml` exports under their bare attribute name.
            for key, value in (config.get('puncta') or {}).items():
                if key in ('thresholds', 'consolidation_distance_threshold'):
                    setattr(self, key, value)
                else:
                    attr_name = f'puncta_{key}'
                    if hasattr(self, attr_name):
                        setattr(self, attr_name, value)

            # Keep the derived path templates in step with freshly loaded data paths.
            if config.get('data_paths') and all(
                    getattr(self, name, None) is not None
                    for name in ('raw_data_path', 'processed_data_path',
                                 'puncta_dir_name')):
                self._update_derived_paths()

            logger.info(f"Configuration loaded from {filename}")
        except Exception as e:
            logger.error(f"Failed to load YAML configuration: {e}")
            raise

    def get_processing_recommendations(self) -> Dict[str, Any]:
        """Get processing recommendations based on current configuration."""
        return {
            'current_chunk_size': self.chunk_size,
            'current_parallel_processes': self.parallel_processes,
            'gpu_enabled': self.use_gpu_processing,
            'auto_cleanup_enabled': self.auto_cleanup_memory,
        }