# Source code for utils

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Convenience functions for the CrossVA module, which help to provide a more
user-friendly experience with inputs and error messages across different files.
"""
import re
import os

import pandas as pd
import numpy as np

def report_list(alist, limit=10, paren=True):
    """Converts alist into a user-friendly string for clearer error messages.

    Each element is reported in single quotes and separated by commas, with
    the last element preceded by "and". When limit is shorter than the list,
    cuts the list at the limit, omits the "and", and ends with "etc" to
    indicate incompleteness.

    Args:
        alist (list): The items to report. Each item is converted to text
            with ``str``.
        limit (int): The maximum number of items to report. If alist has
            more items than limit, the list is reported without conjunction
            and ends with "etc". If None, every item is reported.
            Defaults to 10.
        paren (boolean): Encloses string in parentheses if true.
            Defaults to True.

    Returns:
        str: human-friendly sentence describing the items in alist, or an
        empty string if alist is empty

    Examples:
        >>> report_list(["A","B","C"])
        "('A', 'B', and 'C')"
        >>> report_list(["A","B","C"], limit=2)
        "('A', 'B', etc)"
        >>> report_list(["A","B","C"], limit=2, paren=False)
        "'A', 'B', etc"
        >>> report_list(["x, y", "z"])
        "('x, y', and 'z')"
        >>> report_list([])
        ''
    """
    # len() rather than truthiness, so array-like inputs are accepted too
    if len(alist) == 0:
        return ""

    items = list(alist)
    truncated = limit is not None and len(items) > limit
    if truncated:
        items = items[:limit]

    quoted = [f"'{item!s}'" for item in items]

    # Assemble from the item list itself instead of post-processing the
    # joined string: an item containing ", " must not attract the "and".
    if truncated:
        # end with etc. if incomplete (limit=0 reports just "etc")
        report = ", ".join(quoted + ["etc"])
    elif len(quoted) > 1:
        # use "and" if complete
        report = ", ".join(quoted[:-1]) + ", and " + quoted[-1]
    else:
        report = quoted[0]

    if paren:
        return "(" + report + ")"
    return report
def flexible_read(path_or_df):
    """Takes either a path or a Pandas DataFrame, if path, read in as a
    pandas dataframe.

    Convenience method to add input flexibility for main transform method.

    Args:
        path_or_df (string, path-like or Pandas DataFrame): Either a path to
            the file containing the data, or a dataframe (or anything the
            DataFrame constructor accepts) that has already been read into
            Python. Paths ending in .xlsm, .xlsx or .xls (in any letter
            case) are read with ``pd.read_excel``; every other path is read
            with ``pd.read_csv``.

    Returns:
        Pandas DataFrame: either the data at the given path as read by
        pandas, or the DataFrame constructor used on the path_or_df argument

    Examples:
        Can return a dataframe from a string:

        >>> flexible_read("resources/sample_data/2016WHO_mock_data_1.csv").iloc[:5,:5]
           ID -Id10004 -Id10019 -Id10059 -Id10077
        0   0      wet       dk  married       dk
        1   1      wet   female      NaN       dk
        2   2      dry     male       dk      NaN
        3   3       dk       dk       dk       dk
        4   4      dry      NaN  married       dk

        Or apply the pandas dataframe constructor to the input:

        >>> flexible_read(np.arange(9).reshape(3,3))
           0  1  2
        0  0  1  2
        1  3  4  5
        2  6  7  8
    """
    if isinstance(path_or_df, (str, os.PathLike)):  # input is a path
        path = os.fspath(path_or_df)
        # splitext looks only at the final component's suffix, so dots in
        # directory or file names do not confuse the extension check
        extension = os.path.splitext(path)[1].lower()
        if extension in (".xlsm", ".xlsx", ".xls"):
            return pd.read_excel(path)
        return pd.read_csv(path)
    return pd.DataFrame(path_or_df)
def detect_format(output_format, data):
    """Detects the format of the input data, determining the closest match

    Goes through all of the SUPPORTED_INPUTS and, for each one that has a
    mapping configuration file to output_format, determines the proportion
    of the data's columns that are mapped. The input format with the
    highest proportion is the best match.

    Args:
        output_format (string): The output format, needed for loading the
            configuration files to test each
        data (Pandas DataFrame): The data being processed where we wish to
            determine the most likely format

    Returns:
        str: the best matching format for the input data

    Raises:
        ValueError: if no mapping configuration file to output_format exists
            for any of the supported input formats.

    Examples:
        Can determine the format of a data file:

        >>> detect_format("InSilicoVA", flexible_read("resources/sample_data/2016WHO_mock_data_1.csv"))
        '2016WHOv141'
    """
    # NOTE(review): imported locally rather than at module level, presumably
    # to avoid a circular import with pycrossva - confirm before moving.
    from pycrossva.transform import SUPPORTED_INPUTS
    from pycrossva.configuration import Configuration, CrossVA

    config_file_path = os.path.join(os.path.split(__file__)[0],
                                    "resources/mapping_configuration_files/")
    proportions = {}
    for input_format in SUPPORTED_INPUTS:
        translation_file = (
            f"{config_file_path}{input_format}_to_{output_format}.csv")
        if not os.path.isfile(translation_file):
            continue  # no mapping from this input to the requested output

        # Get a list of the column IDs of the data file that are in the
        # mapping file
        mapping_data = pd.read_csv(translation_file)
        mapping_config = Configuration(config_data=mapping_data,
                                       process_strings=False)
        cross_va = CrossVA(data, mapping_config)
        mapped_data_column_ids = cross_va.data.columns

        # Get a list of *all* the column IDs in the data file
        data_column_ids = data.columns

        # Find the proportion of the column IDs that are mapped
        proportions[input_format] = (len(mapped_data_column_ids)
                                     / len(data_column_ids))

    if not proportions:
        # max() on an empty dict would raise an uninformative ValueError
        raise ValueError(
            "Could not detect the input format: no mapping configuration "
            f"file to output format '{output_format}' was found for any "
            "supported input format.")

    # Return the supported input that has the highest proportion
    return max(proportions, key=proportions.get)
def english_relationship(rel):
    """Returns abbreviated relationship as full english phrase.

    Args:
        rel (str): a string with the relationship being translated,
            e.g., "gt"

    Returns:
        str: a string with the relationship as a longer english phrase
        prefixed with "is", e.g., "is greater than". If relationship is not
        defined in the dict english, then this method returns rel without
        modification.

    Examples:
        >>> english_relationship("gt")
        'is greater than'
        >>> english_relationship("unknown")
        'unknown'
    """
    english = {"gt": "greater than",
               "ge": "greater than or equal to",
               "lt": "less than",
               "le": "less than or equal to",
               "ne": "not equal to",
               "eq": "equal to",
               "between": "between"}
    # Single lookup; None marks an abbreviation that has no translation
    phrase = english.get(rel)
    if phrase is None:
        return rel
    return "is " + phrase
if __name__ == "__main__":
    # When executed as a script, run the examples embedded in this module's
    # docstrings as tests. NORMALIZE_WHITESPACE makes runs of whitespace
    # compare equal, so expected output need not match spacing exactly.
    import doctest

    doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)