Source code for mappings

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Defines the MapCondition class and its subclasses, each of which represents a
single condition that uses a relationship to transform raw data into a boolean
column while preserving the NA values.
"""
from abc import ABC, abstractmethod
import numpy as np
import pandas as pd

from pycrossva.utils import english_relationship


class MapCondition(ABC):
    """Abstract class representing a single mapped condition in the mapping
    data, which gives instructions to transform the raw input data into the
    form needed for a VA instrument. The main configuration class is composed
    of these.

    Attributes:
        condition: the value the source column is compared against
        name (str): the name of the new column to be created
        relationship (str): the relationship of the input data to the
            condition. Should be one of "ge" (greater than or equal to),
            "gt" (greater than), "le" (less than or equal to), "lt" (less
            than), "eq" (equal to), "ne" (not equal to), "contains" (if
            string contains) or "between" (between the two numbers,
            inclusive).
        preq_column (str or None): name of the pre-requisite column if it
            exists, or `None` if no pre-requisite
        source_name (str): the name of the column to be checked

    Subclasses are expected to set `source_dtype` (name of the prepared
    column to evaluate) and `prep_func` (callable applied to the raw column),
    which `check` and `prepare_data` read.
    """

    def __init__(self, condition_row):
        """Inits MapCondition instance with a row of a valid mapping file
        (or pd.Series or dictionary containing the correct fields).

        Args:
            condition_row (Pandas Series or dict): that contains five
                key-value pairs. Required keys are "New Column Name",
                "Relationship", "Condition", "Prerequisite", and
                "Source Column ID". This should come from a valid mapping
                file with a Configuration object.

        Returns:
            None
        """
        # TODO init checks
        self.condition = condition_row["Condition"]
        self.name = condition_row["New Column Name"]
        self.relationship = condition_row["Relationship"]
        self.preq_column = condition_row["Prerequisite"]
        self.source_name = condition_row["Source Column ID"]

    @staticmethod
    def factory(relationship, condition=""):
        """static class factory method, which determines which subclass to
        return

        Args:
            relationship (str): a relationship in (gt, ge, lt, le, ne, eq,
                contains, between) that represents a comparison to be made
                to the raw data
            condition (str or int): the condition being matched. If the
                relationship is ambiguous ("eq" / "ne"), this determines
                whether the condition is numerical or string. Defaults to
                empty string.

        Returns:
            MapCondition: returns specific subclass that corresponds to the
            correct relationship

        Raises:
            AssertionError: if no subclass is defined for `relationship`

        Examples:
            >>> MapCondition.factory("ge") #doctest: +ELLIPSIS
            <class '...NumMapCondition'>
            >>> MapCondition.factory("eq", 0) #doctest: +ELLIPSIS
            <class '...NumMapCondition'>
            >>> MapCondition.factory("eq") #doctest: +ELLIPSIS
            <class '...StrMapCondition'>
            >>> MapCondition.factory("contains") #doctest: +ELLIPSIS
            <class '...ContainsCondition'>
            >>> MapCondition.factory("between") #doctest: +ELLIPSIS
            <class '...BetweenCondition'>
            >>> MapCondition.factory("eqq") #doctest: +ELLIPSIS
            Traceback (most recent call last):
            AssertionError: No defined Condition class for eqq type
        """
        if relationship in ("gt", "ge", "lt", "le"):
            return NumMapCondition
        if relationship in ("ne", "eq"):
            # NOTE: str.isdigit() is only true for unsigned integer text, so
            # conditions such as "4.5" or "-1" are treated as strings here.
            if str(condition).isdigit():
                return NumMapCondition
            return StrMapCondition
        if relationship == "contains":
            return ContainsCondition
        if relationship == "between":
            return BetweenCondition
        # Raised explicitly (rather than `assert 0`) so the failure is not
        # stripped when Python runs with -O.
        raise AssertionError(
            "No defined Condition class for " + relationship + " type"
        )

    def check(self, prepared_data):
        """Checks the condition against dataframe. Do not check NAs, just add
        them back afterward.

        Args:
            prepared_data (Pandas DataFrame): a dataframe containing a
                created column with the name specified in self.source_dtype

        Returns:
            Array: returns a boolean array where the condition is met
            (as float), with NaN wherever the evaluated column is null

        Examples:
            >>> test_df = pd.DataFrame(
            ...     {"source_test_str": ["test condition",
            ...                          "test condition 2", np.nan],
            ...      "source_test_num": [4, 5, np.nan]})
            >>> StrMapCondition({"Condition" : "test condition",
            ...     "New Column Name" : "test new column name",
            ...     "Relationship" : "eq",
            ...     "Prerequisite" : None,
            ...     "Source Column ID" : "source_test"}).check(test_df)
            array([ 1., 0., nan])
            >>> NumMapCondition({"Condition" : 4.5,
            ...     "New Column Name" : "test new column name",
            ...     "Relationship" : "ge",
            ...     "Prerequisite" : None,
            ...     "Source Column ID" : "source_test"}).check(test_df)
            array([ 0., 1., nan])
        """
        eval_col = prepared_data[self.source_dtype]
        return np.where(eval_col.notnull(), self._run_check(eval_col), np.nan)

    def _run_check(self, eval_col):
        """internal method to check the condition on a given column

        Args:
            eval_col (Pandas Series): a Pandas Series containing data to
                evaluate

        Returns:
            Pandas Series: returns a boolean series where the condition is
            met

        Examples:
            >>> test = pd.Series(["A","B","C"])
            >>> StrMapCondition({"Condition" : "B",
            ...     "New Column Name" : "test new column name",
            ...     "Relationship" : "eq",
            ...     "Prerequisite" : None,
            ...     "Source Column ID" : "source_test"})._run_check(test)
            0    False
            1     True
            2    False
            dtype: bool
            >>> test = pd.Series([1, 2, 3])
            >>> NumMapCondition({"Condition" : 3,
            ...     "New Column Name" : "test new column name",
            ...     "Relationship" : "ge",
            ...     "Prerequisite" : None,
            ...     "Source Column ID" : "source_test"})._run_check(test)
            0    False
            1    False
            2     True
            dtype: bool
        """
        # The relationship name doubles as the name of the comparison method
        # looked up on the column (e.g. "ge" -> eval_col.ge).
        return getattr(eval_col, self.relationship)(self.condition)

    def _has_prereq(self):
        """Return True when a pre-requisite column is configured, i.e. the
        preq_column attribute is neither null nor an empty string."""
        return not (pd.isnull(self.preq_column) or self.preq_column == "")

    def check_prereq(self, transformed_data):
        """checks for pre-req column status; if there is no pre-req, returns
        true, else looks up values of pre-req column from transformed_data

        Args:
            transformed_data (Pandas DataFrame): the new dataframe being
                created, which contains any pre-req columns

        Returns:
            boolean or boolean pd.series: representing whether pre-req is
            satisfied

        Examples:
            >>> test_df = pd.DataFrame({"preq_one": np.repeat(True,5),
            ...                         "preq_two": np.repeat(False, 5)})

            If there is no pre-req, simply returns True (1). Pandas can
            interpret this in boolean indexing.

            >>> NumMapCondition({"Condition" : 4.5,
            ...     "New Column Name" : "test new column name",
            ...     "Relationship" : "ge",
            ...     "Prerequisite" : None,
            ...     "Source Column ID" : "source_test"}
            ...     ).check_prereq(test_df)
            1

            If there is a pre-req, then returns the value of
            transformed_data with that column.

            >>> NumMapCondition({"Condition" : 4.5,
            ...     "New Column Name" : "test new column name",
            ...     "Relationship" : "ge",
            ...     "Prerequisite" : "preq_one",
            ...     "Source Column ID" : "source_test"}
            ...     ).check_prereq(test_df)
            0    True
            1    True
            2    True
            3    True
            4    True
            Name: preq_one, dtype: bool
            >>> NumMapCondition({"Condition" : 4.5,
            ...     "New Column Name" : "test new column name",
            ...     "Relationship" : "ge",
            ...     "Prerequisite" : "preq_two",
            ...     "Source Column ID" : "source_test"}
            ...     ).check_prereq(test_df)
            0    False
            1    False
            2    False
            3    False
            4    False
            Name: preq_two, dtype: bool
        """
        if not self._has_prereq():
            return 1
        return transformed_data[self.preq_column]

    @abstractmethod
    def possible_values(self):
        """abstract method stub

        generate a non-exhaustive list possible values implied by condition
        """
        return

    def prepare_data(self, raw_data):
        """prepares raw_data by ensuring dtypes are correct for each
        comparison

        Args:
            raw_data (dataframe): a data frame containing raw data,
                including the column given in self.source_name.

        Returns:
            Array: the column in `raw_data` named in self.source_name, with
            the attribute self.prep_func applied to it; null entries of the
            raw column are kept as NaN.
        """
        return np.where(raw_data[self.source_name].notnull(),
                        self.prep_func(raw_data[self.source_name]),
                        np.nan)

    def describe(self):
        """just a wrapper for the __str__ function"""
        return self.__str__()

    def __repr__(self):
        """console representation for class

        Examples:
            >>> NumMapCondition({"Condition" : 4,
            ...     "New Column Name" : "test new name",
            ...     "Relationship" : "ge",
            ...     "Prerequisite" : "preq_two",
            ...     "Source Column ID" : "source_test"})
            <NumMapCondition: test new name = [column source_test].ge(4.0)>
        """
        return " ".join(["<" + self.__class__.__name__ + ":\t",
                         self.name,
                         "=",
                         "[column " + self.source_name + "]."
                         + self.relationship
                         + "(" + str(self.condition) + ")>"])

    def __str__(self):
        """str representation

        The pre-requisite clause is only appended when a pre-requisite
        column is actually configured (not null and not an empty string).

        Examples:
            >>> print(NumMapCondition({"Condition" : 4,
            ...     "New Column Name" : "test new column name",
            ...     "Relationship" : "ge",
            ...     "Prerequisite" : "preq_two",
            ...     "Source Column ID" : "source_test"}))
            NumMapCondition: New column test new column name is true where
            input data column source_test is greater than or equal to 4.0
            and the new column preq_two is true.
        """
        report_as_list = [self.__class__.__name__ + ":",
                          "New column",
                          self.name,
                          "is true where input data column",
                          self.source_name,
                          english_relationship(self.relationship),
                          str(self.condition)]
        report_as = " ".join(report_as_list)
        # Same "no pre-requisite" test as check_prereq, so a None/NaN
        # pre-requisite is not reported as a column literally named "None".
        if self._has_prereq():
            return (report_as + " and the new column "
                    + str(self.preq_column) + " is true.")
        return report_as
class StrMapCondition(MapCondition):
    """Condition that compares the source column as text; inherits from
    MapCondition.

    Attributes:
        source_dtype (str): instance attribute, self.source_name with "_str"
            appended, naming the prepared column this condition evaluates
        prep_func (function): instance attribute, a callable applied to the
            raw column before a string-based comparison; it casts the column
            with ``astype(str)``
    """

    def __init__(self, condition_row):
        """Inits StrMapCondition

        Args:
            condition_row (Pandas Series or dict): see MapCondition's
                __init__

        Examples:
            >>> StrMapCondition({"Condition" : "test cond",
            ...     "New Column Name" : "test new name",
            ...     "Relationship" : "eq",
            ...     "Prerequisite" : None,
            ...     "Source Column ID" : "source_test"}
            ...     )
            <StrMapCondition: test new name = [column source_test].eq(test cond)>
        """
        super().__init__(condition_row)
        # Cast the raw column to text before any comparison is made.
        self.prep_func = lambda column: column.astype(str)
        self.source_dtype = self.source_name + "_str"

    def possible_values(self):
        """generate a non-exhaustive list possible values implied by
        condition

        Args:
            None

        Returns:
            list: three "missing" placeholders (empty string, NaN, None),
            then the self.condition attribute, then the stock answers
            "yes", "no", "dk" and "ref"

        Examples:
            >>> StrMapCondition({"Condition" : "test condition",
            ...     "New Column Name" : "test new column name",
            ...     "Relationship" : "eq",
            ...     "Prerequisite" : None,
            ...     "Source Column ID" : "source_test"}
            ...     ).possible_values()
            ['', nan, None, 'test condition', 'yes', 'no', 'dk', 'ref']
        """
        placeholders = ["", np.nan, None]
        stock_answers = ["yes", "no", "dk", "ref"]
        return placeholders + [self.condition] + stock_answers
class NumMapCondition(MapCondition):
    """class representing a numerical condition, inherits from MapCondition

    Attributes:
        source_dtype (str): a copy of the instance attribute
            self.source_name with "_num" appended, to represent the expected
            dtype
        prep_func (function): instance attribute, a function to apply before
            making a numerical-based comparison. It calls pd.to_numeric()
            with errors="coerce", which turns non-number data into NaN.
    """

    def __init__(self, condition_row, cast_cond=True):
        """Inits NumMapCondition

        Args:
            condition_row (Pandas Series or dict): see MapCondition's
                __init__
            cast_cond (bool): Whether to force self.condition attribute to be
                a float. Defaults to True.

        Examples:
            >>> NumMapCondition({"Condition" : 3,
            ...     "New Column Name" : "test new name",
            ...     "Relationship" : "ge",
            ...     "Prerequisite" : None,
            ...     "Source Column ID" : "source_test"})
            <NumMapCondition: test new name = [column source_test].ge(3.0)>
        """
        super().__init__(condition_row)
        if cast_cond:
            self.condition = float(condition_row["Condition"])
        self.source_dtype = self.source_name + "_num"
        self.prep_func = lambda x: pd.to_numeric(x, errors="coerce")

    def possible_values(self):
        """generate a non-exhaustive list of possible values implied by
        condition

        Args:
            None

        Returns:
            list: list containing range of possible values. For a "greater
            than" relationship ("gt" / "ge") the list holds values stepping
            by 1 from self.condition + 1 up to (excluding) self.condition * 2,
            and always at least self.condition + 1 itself. For every other
            relationship it holds values stepping by 1 from 0 up to
            (excluding) self.condition. If the relationship involves
            equality ("ge", "le", "eq", "ne"), self.condition itself is
            appended as well.

        Examples:
            >>> NumMapCondition({"Condition" : 3,
            ...     "New Column Name" : "test new name",
            ...     "Relationship" : "ge",
            ...     "Prerequisite" : None,
            ...     "Source Column ID" : "source_test"}).possible_values()
            [4.0, 5.0, 3.0]
            >>> NumMapCondition({"Condition" : 3,
            ...     "New Column Name" : "test new name",
            ...     "Relationship" : "lt",
            ...     "Prerequisite" : None,
            ...     "Source Column ID" : "source_test"}).possible_values()
            [0.0, 1.0, 2.0]
            >>> NumMapCondition({"Condition" : 3,
            ...     "New Column Name" : "test new name",
            ...     "Relationship" : "eq",
            ...     "Prerequisite" : None,
            ...     "Source Column ID" : "source_test"}).possible_values()
            [0.0, 1.0, 2.0, 3.0]
            >>> NumMapCondition({"Condition" : 1,
            ...     "New Column Name" : "test new name",
            ...     "Relationship" : "gt",
            ...     "Prerequisite" : None,
            ...     "Source Column ID" : "source_test"}).possible_values()
            [2.0]
        """
        if self.relationship[0] == "g":
            # condition * 2 collapses the range for conditions <= 1, so make
            # sure at least one value above the condition is produced.
            upper = max(self.condition * 2, self.condition + 2)
            possible = np.arange(self.condition + 1, upper)
        else:
            possible = np.arange(0, self.condition)
        # Every relationship that tests equality (including "eq" itself)
        # should exercise the boundary value.
        if self.relationship in ("ge", "le", "eq", "ne"):
            possible = np.append(possible, self.condition)
        return possible.tolist()
class ContainsCondition(StrMapCondition):
    """Subclass of StrMapCondition that overrides ._run_check() method for
    the `contains` relationship
    """

    def _run_check(self, eval_col):
        """overrides _run_check condition of abstract MapCondition.
        checks condition against input data to see if input data contains the
        substring in the self.condition attribute.

        The condition is matched as a literal substring (regex=False), so
        characters such as "(", "+" or "." in the condition carry no special
        meaning.

        Args:
            eval_col (Pandas Series): the prepared column to evaluate; null
                entries are treated as empty strings for the comparison

        Returns:
            boolean Pandas Series: a Pandas Series that is true where
            eval_col contains the substring in self.condition

        Examples:
            >>> test_df = pd.DataFrame(
            ...     {"source_test_1_str": ["test condition",
            ...                            "test condition 2"],
            ...      "source_test_2_str": ["test test", "test condition"]})
            >>> ContainsCondition({"Condition" : "test condition",
            ...     "New Column Name" : "test new column name",
            ...     "Relationship" : "contains",
            ...     "Prerequisite" : None,
            ...     "Source Column ID" : "source_test_1"}).check(test_df)
            array([1., 1.])
            >>> ContainsCondition({"Condition" : "test condition",
            ...     "New Column Name" : "test new column name",
            ...     "Relationship" : "contains",
            ...     "Prerequisite" : None,
            ...     "Source Column ID" : "source_test_2"}).check(test_df)
            array([0., 1.])
        """
        return eval_col.fillna("").str.contains(self.condition, regex=False)
class BetweenCondition(NumMapCondition):
    """Subclass of NumMapCondition that overrides __init__, ._run_check() and
    .possible_values() for the `between` relationship

    Attributes:
        low (float): the lowest acceptable value (inclusive)
        high (float): the highest acceptable value (inclusive)
    """

    def __init__(self, condition_row):
        """Inits between condition. Same as NumMapCondition init, except that
        condition_row["Condition"] is not cast to a float; it is kept as
        given, split at " to ", and the pieces are converted to floats.

        Args:
            condition_row (Pandas Series or dict): see MapCondition's
                __init__

        Examples:
            >>> BetweenCondition({"Condition" : "3 to 5",
            ...     "New Column Name" : "test new column name",
            ...     "Relationship" : "between",
            ...     "Prerequisite" : None,
            ...     "Source Column ID" : "source_test_1"})
            <BetweenCondition: test new column name = [column source_test_1].between(3 to 5)>
        """
        super().__init__(condition_row, cast_cond=False)
        # Convert every piece first, then unpack exactly two bounds.
        bounds = tuple(map(float, self.condition.split(" to ")))
        self.low, self.high = bounds

    def _run_check(self, eval_col):
        """checks condition against input data to see if condition is true.

        Args:
            eval_col (Pandas Series): the prepared column to evaluate

        Returns:
            boolean Pandas Series: true where eval_col holds a number
            between self.low and self.high, inclusive

        Examples:
            >>> test_df = pd.DataFrame({"source_test_1_num": [1,2,3],
            ...                         "source_test_2_num": [4,5,6]})
            >>> BetweenCondition({"Condition" : "3 to 5",
            ...     "New Column Name" : "test new column name",
            ...     "Relationship" : "between",
            ...     "Prerequisite" : None,
            ...     "Source Column ID" : "source_test_1"}).check(test_df)
            array([0., 0., 1.])
            >>> BetweenCondition({"Condition" : "3 to 5",
            ...     "New Column Name" : "test new column name",
            ...     "Relationship" : "between",
            ...     "Prerequisite" : None,
            ...     "Source Column ID" : "source_test_2"}).check(test_df)
            array([1., 1., 0.])
        """
        return eval_col.between(left=self.low, right=self.high)

    def possible_values(self):
        """generate a non-exhaustive list of possible values implied by the
        condition

        Args:
            None

        Returns:
            list: values stepping by 1 from self.low - 1 up to (excluding)
            self.high + 2, i.e. one step beyond each bound

        Examples:
            >>> BetweenCondition({"Condition" : "3 to 5",
            ...     "New Column Name" : "test new column name",
            ...     "Relationship" : "between",
            ...     "Prerequisite" : None,
            ...     "Source Column ID" : "source_test_2"}
            ...     ).possible_values()
            [2.0, 3.0, 4.0, 5.0, 6.0]
        """
        start = self.low - 1
        stop = self.high + 2
        return np.arange(start, stop).tolist()
if __name__ == "__main__":
    # When executed as a script, run every doctest in this module; whitespace
    # differences between expected and actual output are ignored.
    from doctest import NORMALIZE_WHITESPACE, testmod

    testmod(optionflags=NORMALIZE_WHITESPACE)