# Source code for configuration

"""
Structure for Configuration class
"""

# !/usr/bin/env python3
#  -*- coding: utf-8 -*-
import re

import numpy as np
import pandas as pd

from pycrossva.mappings import MapCondition
from pycrossva.utils import report_list
from pycrossva.validation import Validation, Err, Warn


class Configuration:
    """Describes the relationship between a set of input data and output data.

    A Configuration is composed of MapConditions that transform an input data
    source (2012 WHO, 2016 WHO 141, 2016 WHO 151, PHMRC SHORT) into a
    different data form (PHMRC SHORT, InSilicoVA, InterVA4, InterVA5, or
    Tariff2) for verbal autopsy.

    Attributes:
        given_columns (Pandas Series): columns of the mapping dataframe.
        required_columns (Pandas Series): required columns in mapping data.
        main_columns (list): the four main columns required in config_data.
        valid_relationships (Pandas Series): contains list of valid
            relationships to use in comparisons. Relationships should be an
            attr of Pandas Series object, or be defined as a subclass of
            MapCondition.
        config_data (Pandas DataFrame): dataframe containing mapping
            relationships written out.
        given_relationships (Pandas Series): unique, non-NA relationships
            referenced in the config data given at construction.
        given_prereq (Pandas Series): lists pre-requisites referenced in
            config data.
        new_columns (Pandas Series): lists the new columns to be created
            with config data.
        source_columns (Pandas Series): lists the source columns required in
            the raw input data.
        verbose (int): controls default verbosity of printing to console.
        process_strings (boolean): whether or not to remove whitespace and
            non-alphanumeric characters from strings in condition field and
            in raw_data during mapping.
        validation (Validation): a validation object containing the
            validation checks made.
    """

    required_columns = pd.Series(
        ['New Column Name', 'New Column Documentation',
         'Source Column ID', 'Source Column Documentation',
         'Relationship', 'Condition', 'Prerequisite', ],
        name="expected columns")
    main_columns = ["New Column Name", "Source Column ID", "Relationship",
                    "Condition"]
    valid_relationships = pd.Series(
        ["gt", "ge", "lt", "le", "between", "eq", "ne", "contains"],
        name="valid relationships")

    def __init__(self, config_data, verbose=1, process_strings=True):
        """Inits configuration class from a Pandas DataFrame containing
        mapping data.

        Args:
            config_data (Pandas DataFrame): dataframe representing the
                mapping data relationships, in specified format.
            verbose (int): should be in range 1-5, controls print output,
                where 0=silent. Defaults to 1.
            process_strings (boolean): Whether or not to normalize string
                conditions and data. Defaults to True.

        Raises:
            KeyError: if `config_data` lacks any of the "Relationship",
                "Prerequisite", "New Column Name" or "Source Column ID"
                columns, which are read directly here.

        Examples:
            >>> MAP_PATH = "resources/mapping_configuration_files/"
            >>> EX_MAP_1 = pd.read_csv(MAP_PATH + "example_config_1.csv")
            >>> Configuration(EX_MAP_1)
            Unvalidated Configuration instance with 16 mapping conditions
        """
        # TODO check for df & # of rows & these 3 columns
        # throw cant instantiate error?
        self.config_data = config_data
        self.given_columns = pd.Series(config_data.columns,
                                       name="the given columns")
        self.process_strings = process_strings

        def plain_info(aseries, new_name):
            """Return `aseries` without NAs or duplicates, renamed to
            `new_name`."""
            return aseries.drop_duplicates().dropna().rename(new_name)

        # NOTE: these summaries are snapshots of the data as given; they are
        # not refreshed when validate() cleans up self.config_data.
        self.given_relationships = plain_info(config_data["Relationship"],
                                              "Relationship column")
        self.given_prereq = plain_info(config_data["Prerequisite"],
                                       "Prerequisite column")
        self.new_columns = plain_info(config_data["New Column Name"],
                                      "New Column Name column")
        self.source_columns = plain_info(config_data["Source Column ID"],
                                         "Source Column ID column")

        self.verbose = verbose
        self.validation = Validation("Mapping Configuration")

    def __str__(self):
        """String representation of class (same text as `repr`)."""
        return self.__repr__()

    def __repr__(self):
        """String representation of class: validation status, class name and
        the number of rows currently held in `config_data`."""
        val_status = ("Validated" if self.validation.is_valid()
                      else "Unvalidated")
        return " ".join([val_status, self.__class__.__name__,
                         "instance with", str(len(self.config_data)),
                         "mapping conditions"])

    def list_conditions(self):
        """Lists the final mapping conditions contained in Configuration
        object.

        Rows without a prerequisite are listed before rows that have one.
        `self.config_data` itself is left untouched: the helper "Standalone"
        column used for ordering only exists on a derived frame (and on the
        rows handed to each MapCondition).

        Returns:
            list: list of MapConditions, where each MapCondition is created
            from a row of processed mapping data.

        Examples:
            >>> MAP_PATH = "resources/mapping_configuration_files/"
            >>> EX_MAP_1 = pd.read_csv(MAP_PATH + "example_config_1.csv")
            >>> c = Configuration(EX_MAP_1)
            >>> c.list_conditions()[:5]
            [<StrMapCondition: AB_POSIT = [column Id10403].eq(yes)>,
             <StrMapCondition: AB_SIZE = [column Id10362].eq(yes)>,
             <NumMapCondition: AC_BRL = [column Id10169].lt(14.0)>,
             <NumMapCondition: AC_CONV = [column Id10221].lt(10.0)>,
             <NumMapCondition: AC_COUGH = [column Id10154].lt(21.0)>]
        """
        # assign() returns a new frame, so neither self.config_data nor the
        # DataFrame the caller passed to __init__ gains an extra column.
        # Sort first so that columns w/o prereqs are processed first.
        ordered = self.config_data.assign(
            Standalone=self.config_data["Prerequisite"].isnull()
        ).sort_values("Standalone", ascending=False)
        return [MapCondition.factory(row["Relationship"],
                                     row["Condition"])(row)
                for _, row in ordered.iterrows()]

    def validate(self, verbose=None):
        """Prepares and validates the Configuration object's mapping
        conditions.

        Validation fails if there are any inoperable errors. Problems that
        can be fixed in place are processed and flagged as warnings.
        `self.config_data` is replaced by the cleaned-up mapping data.

        Args:
            verbose (int): controls print output, should be in range 0-5,
                each higher level includes the messages of each level below
                it. Where verbose = 0, nothing will be printed to console.
                Where verbose = 1, print only errors to console,
                where verbose = 2, also print warnings,
                where verbose = 3, also print suggestions and status checks,
                where verbose = 4, also print passing validation checks,
                where verbose = 5, also print description of configuration
                conditions.
                Defaults to None; if None, replaced with self.verbose
                attribute.

        Returns:
            Boolean: boolean representing whether there are any errors that
            prevent validation.

        Examples:
            >>> MAP_PATH = "resources/mapping_configuration_files/"
            >>> EX_MAP_2 = pd.read_csv(MAP_PATH + "example_config_2.csv")
            >>> c = Configuration(EX_MAP_2)
            >>> c.validate(verbose=4)
            Validating Mapping Configuration . . .
            <BLANKLINE>
             CHECKS PASSED
            [X] All expected columns ('New Column Name',
                'New Column Documentation', 'Source Column ID',
                'Source Column Documentation', 'Relationship', 'Condition',
                and 'Prerequisite') accounted for in configuration file.
            [X] No leading/trailing spaces column New Column Name detected.
            [X] No leading/trailing spaces column Relationship detected.
            [X] No leading/trailing spaces column Prerequisite detected.
            [X] No leading/trailing spaces column Condition detected.
            [X] No whitespace in column Condition detected.
            [X] No upper case value(s) in column Relationship detected.
            [X] No upper case value(s) in column Condition detected.
            [X] No non-alphanumeric value(s) in column Source Column ID
                detected.
            [X] No non-alphanumeric value(s) in column Relationship detected.
            [X] No non-alphanumeric value(s) in column Condition detected.
            [X] No new column(s) listed but not defined in Mapping
                Configuration detected.
            [X] No NA's in column New Column Name detected.
            [X] No NA's in column Source Column ID detected.
            <BLANKLINE>
             ERRORS
            [!] 3 values in Relationship column were invalid ('eqqqq',
                'another fake', and 'gee'). These must be a valid method of
                pd.Series, e.g. ('gt', 'ge', 'lt', 'le', 'between', 'eq',
                'ne', and 'contains') to be valid.
            [!] 2 row(s) containing a numerical relationship with non-number
                condition detected in row(s) #8, and #9.
            [!] 2 values in Prerequisite column were invalid ('ABDOMM', and
                'Placeholder here'). These must be defined in the 'new column
                name' column of the config file to be valid.
            <BLANKLINE>
             WARNINGS
            [?] 2 whitespace in column New Column Name detected in row(s) #6,
                and #8. Whitespace will be converted to '_'
            [?] 1 whitespace in column Relationship detected in row(s) #4.
                Whitespace will be converted to '_'
            [?] 1 whitespace in column Prerequisite detected in row(s) #9.
                Whitespace will be converted to '_'
            [?] 1 non-alphanumeric value(s) in column New Column Name
                detected in row(s) #6. This text should be alphanumeric.
                Non-alphanumeric characters will be removed.
            [?] 2 duplicate row(s) detected in row(s) #1, and #14.
                Duplicates will be dropped.
            [?] 1 NA's in column Relationship detected in row(s) #3.
            [?] 1 NA's in column Condition detected in row(s) #6.
            False
        """
        if verbose is None:
            verbose = self.verbose

        # Check that all expected columns accounted for
        col_passing_msg = " ".join(["All expected columns",
                                    report_list(self.required_columns),
                                    "accounted for in configuration file."])
        self.validation.must_contain(self.given_columns,
                                     self.required_columns,
                                     passing_msg=col_passing_msg)
        # reindex - any missing columns become filled with NA
        self.config_data = self.config_data.reindex(
            columns=self.required_columns)
        # Drop any rows that are entirely blank without warnings
        self.config_data = self.config_data.dropna(how="all")

        # Processing strings
        # columns that should contain no whitespace
        ws_col = ["New Column Name", "Relationship", "Prerequisite"]
        lowercase_col = ["Relationship"]  # columns that should be lowercase
        if self.process_strings:
            ws_col.append("Condition")
            lowercase_col.append("Condition")

        # fill NAs with a placeholder string for str ops (undone below)
        self.config_data = self.config_data.fillna("na")
        # Remove whitespace
        self.config_data.loc[:, ws_col] = self.validation.fix_whitespace(
            self.config_data.loc[:, ws_col])
        # Check for uppercase characters
        self.config_data.loc[:, lowercase_col] = self.validation.fix_upcase(
            self.config_data.loc[:, lowercase_col])
        # Check that main columns contain only alphanumeric values
        self.config_data.loc[:, self.main_columns] = self.validation.fix_alnum(
            self.config_data.loc[:, self.main_columns])
        # Turn the placeholder back into real missing values
        self.config_data = self.config_data.replace("na", np.nan)

        # Check for duplicate rows & drop them
        self.validation.flag_rows(self.config_data.duplicated(),
                                  flag_criteria="duplicate row(s)",
                                  flag_action="Duplicates will be dropped.")
        self.config_data = self.config_data.drop_duplicates()

        # Check and note if there are missing sources/conditions/rel
        # ie if we expect any of these sources to be absent
        defined_no_source = (
            np.all(self.config_data[["Source Column ID", "Relationship",
                                     "Condition"]].isnull(), axis=1)
            & self.config_data["New Column Name"].notnull())
        self.validation.flag_elements(
            defined_no_source,
            self.config_data["New Column Name"],
            criteria="new column(s) listed but not defined")
        self.config_data = self.config_data.loc[~defined_no_source, :]

        # Check & drop rows that contain any NAs in main columns
        self.validation.check_na(self.config_data[self.main_columns])
        self.config_data = self.config_data.loc[np.all(
            self.config_data[self.main_columns].notnull(), axis=1), :]

        # check all relationships in relationship column are valid
        # (uses the relationships as given at construction time)
        self.validation.all_valid(self.given_relationships,
                                  self.valid_relationships,
                                  "a valid method of pd.Series, e.g. " +
                                  report_list(self.valid_relationships))

        # check for non-number conditions with numerical relationships
        invalid_num = (self.config_data["Relationship"].isin(
            ["gt", "ge", "le", "lt"]) &
            (pd.to_numeric(self.config_data["Condition"],
                           errors="coerce").isnull()))
        self.validation.flag_rows(invalid_num,
                                  flag_criteria="row(s) containing a numerical"
                                  + " relationship with non-number condition",
                                  flag_tier=Err)

        # check all prerequisite columns are also defined in configuration
        self.validation.all_valid(self.given_prereq, self.new_columns,
                                  "defined in the 'new column name' column " +
                                  "of the config file")

        self.validation.report(verbose=verbose)  # report
        if verbose == 5:
            self.describe()
        # return true only if there are zero errors
        return self.validation.is_valid()

    def describe(self):
        """Prints the mapping relationships in the Configuration object to
        console.

        For each descriptive column of `config_data`, prints the number of
        unique values together with a short sample (at most 5) of them.

        Args:
            None

        Returns:
            None

        Examples:
            >>> MAP_PATH = "resources/mapping_configuration_files/"
            >>> EX_MAP_1 = pd.read_csv(MAP_PATH + "example_config_1.csv")
            >>> Configuration(EX_MAP_1).describe()
            MAPPING STATS
            <BLANKLINE>
             - 16 new columns produced ('AB_POSIT', 'AB_SIZE', 'AC_BRL',
               'AC_CONV', 'AC_COUGH', etc)
             - 12 source columns required ('Id10403', 'Id10362', 'Id10169',
               'Id10221', 'Id10154', etc)
             - 7 relationships invoked ('eq', 'lt', 'between', 'ge',
               'contains', etc)
             - 13 conditions listed ('yes', '14', '10', '21', '15 to 49', etc)
             - 1 prerequisites checked ('FEMALE')
        """
        print("MAPPING STATS\n")
        spacer = " - \t"
        unique_checks = [("New Column Name", "new columns produced"),
                         ("Source Column ID", "source columns required"),
                         ("Relationship", "relationships invoked"),
                         ("Condition", "conditions listed"),
                         ("Prerequisite", "prerequisites checked")]
        for col_name, context in unique_checks:
            print(spacer, self.config_data[col_name].nunique(), context,
                  report_list(self.config_data[col_name].dropna().unique(),
                              limit=5))
class CrossVA:
    """Class representing raw VA data, and how to map it to an algorithm.

    Attributes:
        mapping (Configuration): a validated Configuration object that
            details how to transform the type of data in `raw_data` to the
            desired output.
        data (Pandas DataFrame): a Pandas DataFrame containing the raw VA
            data, cropped and renamed to the source column IDs.
        prepared_data (Pandas DataFrame): a Pandas DataFrame containing a
            prepared form of the VA data to use with the Configuration
            object. Empty until `validate` succeeds.
        validation (Validation): Validation object containing the validation
            checks that have been made on the raw data and between the raw
            data and mapping Configuration.
        verbose (int): Controls verbosity of printing to console, 0-5 where
            0 is silent.
    """

    # NOTE: the list default for `na_values` is kept for interface
    # compatibility; it is only read here, never mutated.
    def __init__(self, raw_data, mapping_config,
                 na_values=["dk", "ref", ""], verbose=2):
        """Inits CrossVA class.

        Args:
            raw_data (Pandas DataFrame): a Pandas DataFrame containing the
                raw data.
            mapping_config (Configuration): a validated Configuration object
                that details how to transform the type of data in `raw_data`
                to the desired output. If it is not yet valid, one attempt
                is made to validate it here.
            na_values (list): List of values to consider NA.
            verbose (int): Controls verbosity of printing to console.
                Defaults to 2.

        Examples:
            >>> MAP_PATH = "resources/mapping_configuration_files/"
            >>> EX_MAP_1 = pd.read_csv(MAP_PATH + "example_config_1.csv")
            >>> EX_DATA_1 = pd.read_csv("resources/sample_data/mock_data_2016WHO151.csv")
            >>> CrossVA(EX_DATA_1, Configuration(EX_MAP_1))
            <CrossVA with (4, 12) raw data and Validated Configuration
            instance with 16 mapping conditions>
        """
        # if mapping_config isn't valid, attempt to validate before aborting
        if not mapping_config.validation.is_valid():
            if not mapping_config.validate():
                # TODO raise error - mapping_config should be valid.
                # Deliberately left as a no-op for now: an invalid mapping
                # is still accepted.
                pass

        # Crop input data to only the columns which have a matching ID
        # at the end of their name using regex, and then rename dataframe to
        # just the column IDs expected from the mapping.
        self.mapping = mapping_config
        # IDs are literal text, so escape them: an ID containing a regex
        # metacharacter (".", "(", "+", ...) must not alter the pattern.
        escaped_ids = [re.escape(col_id)
                       for col_id in self.mapping.source_columns.tolist()]
        id_pattern = "(" + "$|".join(escaped_ids) + "$)"
        new_columns = raw_data.columns.str.extract(id_pattern, expand=False)

        cropped_data = raw_data.loc[:, new_columns.notnull()].copy()
        cropped_data = cropped_data.replace(na_values, np.nan)
        self.data = cropped_data.rename(
            columns=pd.Series(new_columns, raw_data.columns))

        self.prepared_data = pd.DataFrame()
        self.verbose = verbose
        self.validation = Validation("Mapping-Data Relationship")

    def __str__(self):
        """String representation of class (same text as `repr`)."""
        return self.__repr__()

    def __repr__(self):
        """String representation of class: shape of the cropped raw data and
        the description of the mapping Configuration."""
        return ("<" + self.__class__.__name__ + " with " +
                str(self.data.shape) + " raw data and " +
                str(self.mapping) + ">")

    def process(self):
        """Applies the given configuration object's mappings to the given
        raw data.

        If this instance has not been validated yet, `validate` is run
        first.

        Args:
            None

        Returns:
            Pandas DataFrame: a dataframe where the transformations specified
            have been applied to the raw data, resulting in one column per
            new column named in the mapping, holding 1 (true), 0 (false) or
            NA.

        Raises:
            ValueError: if the instance is not valid and cannot be
                validated.
        """
        if not self.validation.is_valid():
            if not self.validate():
                raise ValueError(("Can't process without valid"
                                  " CrossVA instance"))

        # Create empty dataframe with the list of columns given in mapping.
        # If the new columns listed in the mapping have no definition (ie
        # source, relationship, and condition) then they will keep their
        # default value as NA.
        # NOTE(review): the result is indexed 0..n-1 rather than by
        # self.data.index - confirm this is intended for raw data that
        # carries a non-default index.
        transformed_data = pd.DataFrame(index=np.arange(len(self.data)),
                                        columns=self.mapping.new_columns,
                                        dtype=float)
        transformed_data.columns.name = ""

        for condition in self.mapping.list_conditions():
            # this is the big transformation
            # create column_values (boolean pd.series) representing if
            # condition and preq is met in raw data for each row,
            # preserving NA
            pre_req = condition.check_prereq(transformed_data)
            initial_condition = condition.check(self.prepared_data)
            # using 0 for false and 1 for true
            new_val = np.where((initial_condition == 0) | (pre_req == 0),
                               0, pre_req * initial_condition)
            # update new column where new column is currently NA and new
            # values are True or False. If new column is False, overwrite if
            # new value is true. If new column is already True, do not
            # overwrite. This preserves NAs logically, creates implicit ANY
            # relationship between different conditions for the same new
            # column
            transformed_data[condition.name] = np.sign(
                transformed_data[condition.name].add(new_val, fill_value=0))
        return transformed_data

    def validate(self, verbose=None):
        """Validates that CrossVA's raw input data and its mapping
        configuration object are compatible and prepares input data for use.

        `prepared_data` is rebuilt from scratch on every successful call, so
        validating more than once does not duplicate its columns.

        Args:
            verbose (int): int from 0 to 5, representing verbosity of
                printing to console. Defaults to None; if None, replaced
                with self.verbose attribute.

        Returns:
            boolean: True if valid, False if not.

        Examples:
            >>> MAP_PATH = "resources/mapping_configuration_files/"
            >>> EX_MAP_1 = pd.read_csv(MAP_PATH + "example_config_1.csv")
            >>> EX_DATA_1 = pd.read_csv("resources/sample_data/mock_data_2016WHO151.csv")
            >>> CrossVA(EX_DATA_1, Configuration(EX_MAP_1)).validate(verbose=0)
            True
        """
        if verbose is None:
            verbose = self.verbose

        if self.mapping.process_strings:
            # strip whitespace and replace non-trailing/leading with
            # underscore for str operation convenience; NAs are swapped for
            # a placeholder string while the string fixes run
            self.data = self.data.fillna("NA")
            self.data = self.validation.fix_whitespace(self.data)
            # make all characters lowercase
            self.data = self.validation.fix_upcase(self.data)
            # strip for alphanumeric characters
            self.data = self.validation.fix_alnum(self.data)
            # NOTE(review): assumes the "NA" placeholder survives
            # fix_upcase unchanged - confirm against Validation.
            self.data = self.data.replace("NA", np.nan)

        # check all expected columns from config Source Column ID are present
        col_msg = "All expected columns from mapping file are present in data"
        self.validation.must_contain(
            self.data.columns.rename("the input data columns"),
            self.mapping.source_columns.rename(
                "expected source column IDs listed in mapping file"),
            passing_msg=col_msg, fail=Warn)

        # warn about associated missing values
        flagged_missing = ~(self.mapping.config_data["Source Column ID"].isin(
            self.data.columns))
        if flagged_missing.sum() > 0:
            affected = self.mapping.config_data[flagged_missing].groupby(
                "Source Column ID")["New Column Name"].apply(list)
            self.validation.affected_by_absence(affected)

        self.validation.no_duplicates(self.data.columns)

        # TODO Check to see if relationship and conditions correspond to
        # logical values in each column

        if self.validation.is_valid():
            # add missing columns as NA
            self.data = self.data.reindex(columns=self.mapping.source_columns)
            # Collect every prepared column first and build the frame in one
            # go; this avoids "PerformanceWarning: DataFrame is highly
            # fragmented." from inserting columns one at a time.
            dict_of_cols = {
                mapping_condition.source_dtype:
                    mapping_condition.prepare_data(self.data)
                for mapping_condition in self.mapping.list_conditions()
            }
            # Replace (rather than append to) any earlier prepared data so a
            # repeated validate() cannot produce duplicate column labels.
            self.prepared_data = pd.DataFrame(dict_of_cols)

        self.validation.report(verbose)
        return self.validation.is_valid()
if __name__ == "__main__":
    # When executed as a script, run the examples embedded in this module's
    # docstrings. NORMALIZE_WHITESPACE lets expected output be wrapped freely.
    from doctest import NORMALIZE_WHITESPACE, testmod

    testmod(optionflags=NORMALIZE_WHITESPACE)