# Source code for missense_kinase_toolkit.databases.io_utils

import os
import pandas as pd


# Name of the environment variable that holds the output directory path;
# check_outdir_exists() looks it up in os.environ.
OUTPUT_DIR_VAR = "OUTPUT_DIR"
"""str: Environment variable for output directory"""


def check_outdir_exists(
) -> "str | None":
    """Check if OUTPUT_DIR is in the environment variables and create the directory if needed.

    Returns
    -------
    str | None
        Path read from the OUTPUT_DIR environment variable, or None (after
        printing a notice) if the variable is not set

    """
    try:
        path_data = os.environ[OUTPUT_DIR_VAR]
    except KeyError:
        print("OUTPUT_DIR not found in environment variables...")
        # Return explicitly: falling through would reference an unbound local
        return None

    if not os.path.exists(path_data):
        # exist_ok tolerates another process creating the directory between
        # the existence check and this call
        os.makedirs(path_data, exist_ok=True)

    return path_data
def convert_str2list(
    input_str: str
) -> list[str]:
    """Split a comma-separated string into a list of trimmed items.

    Parameters
    ----------
    input_str : str
        String to split on commas

    Returns
    -------
    list[str]
        One entry per comma-separated field, each with leading and trailing
        whitespace stripped; empty fields are kept as empty strings

    """
    return [token.strip() for token in input_str.split(",")]
def load_csv_to_dataframe(
    filename: str,
) -> "pd.DataFrame | None":
    """Load a CSV file from the output directory as a dataframe.

    Parameters
    ----------
    filename : str
        Filename to load (either with or without "csv" suffix)

    Returns
    -------
    pd.DataFrame | None
        Dataframe loaded from the CSV file; None if check_outdir_exists()
        returns no path or the file does not exist (a notice is printed in
        the latter case)

    """
    # Strip any ".csv" already present, then append it exactly once
    filename = filename.replace(".csv", "") + ".csv"

    path_data = check_outdir_exists()
    if path_data is None:
        # No output directory to read from; os.path.join would fail on None
        return None

    try:
        return pd.read_csv(os.path.join(path_data, filename))
    except FileNotFoundError:
        print(f"File {filename} not found in {path_data}...")
        # Return explicitly instead of falling through to an unbound local
        return None
def save_dataframe_to_csv(
    df: pd.DataFrame,
    filename: str,
) -> None:
    """Write a dataframe to a CSV file inside the output directory.

    Parameters
    ----------
    df : pd.DataFrame
        Dataframe to write
    filename : str
        Target filename (either with or without "csv" suffix)

    Returns
    -------
    None

    """
    # Normalize the name so it carries the ".csv" suffix exactly once
    csv_name = f'{filename.replace(".csv", "")}.csv'
    out_dir = check_outdir_exists()
    target_path = os.path.join(out_dir, csv_name)
    # The dataframe index is not written to the file
    df.to_csv(target_path, index=False)
def concatenate_csv_files_with_glob(
    str_find: str,
    str_remove: str = "transformed_mutations.csv",
) -> pd.DataFrame:
    """Use glob to find csv files in the output directory and concatenate them.

    Parameters
    ----------
    str_find : str
        Glob pattern, relative to the output directory, used to find the csv
        files of interest (either with or without "csv" suffix)
    str_remove : str
        Matched paths containing this substring are excluded

    Returns
    -------
    pd.DataFrame
        Concatenated dataframe (original row indices are kept); empty if
        check_outdir_exists() returns no path or no file matches

    """
    import glob

    # Strip any ".csv" already present, then append it exactly once
    str_find = str_find.replace(".csv", "") + ".csv"

    path_data = check_outdir_exists()
    if path_data is None:
        # No output directory to search; os.path.join would fail on None
        return pd.DataFrame()

    # glob returns matches in arbitrary order; sort so the row order of the
    # result is reproducible across runs and platforms
    csv_files = sorted(
        csv_file
        for csv_file in glob.glob(os.path.join(path_data, str_find))
        if str_remove not in csv_file
    )

    if not csv_files:
        print(f"No files matching {str_find} found in {path_data}...")
        return pd.DataFrame()

    # Read everything first and concatenate once: concatenating inside the
    # loop re-copies the accumulated rows on every iteration and starts from
    # an empty frame, which pandas treats specially when inferring dtypes
    df_combo = pd.concat(
        [pd.read_csv(csv_file, low_memory=False) for csv_file in csv_files]
    )

    # TODO: implement remove duplicates
    return df_combo
def parse_iterabc2dataframe(
    input_object: iter,
) -> pd.DataFrame:
    """Parse an iterable of objects into a dataframe with one row per object.

    Parameters
    ----------
    input_object : iter
        Iterable of objects (e.g. Abstract Base Classes objects); one-shot
        iterators such as generators are supported

    Returns
    -------
    pd.DataFrame
        Dataframe with one row per object and one column per attribute name
        reported by ``dir()`` on any of the objects, in sorted column order;
        attributes an object does not have are filled with None

    """
    # Materialize once: the input is traversed twice below, and a one-shot
    # iterator would already be exhausted on the second pass
    entries = list(input_object)

    # Union of every name dir() reports across all entries
    attr_names = sorted({attr for entry in entries for attr in dir(entry)})

    dict_dir = {
        attr: [getattr(entry, attr, None) for entry in entries]
        for attr in attr_names
    }

    df = pd.DataFrame.from_dict(dict_dir)
    df = df[attr_names]

    return df