Source code for mappings

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Defines the MapCondition class and its subclasses, each of which represents a
single condition that uses a relationship to transform raw data into a boolean
column while preserving the NA values.
"""
from abc import ABC, abstractmethod
import numpy as np
import pandas as pd

from pycrossva.utils import english_relationship


class MapCondition(ABC):
    """Abstract class representing a single mapped condition in the mapping
    data, which gives instructions to transform the raw input data into the
    form needed for a VA instrument. The main configuration class is composed
    of these.

    Concrete subclasses are expected to set ``self.source_dtype`` (name of the
    prepared column to evaluate) and ``self.prep_func`` (callable applied to
    the raw column), and to implement :meth:`possible_values`.

    Attributes:
        condition: the value the source data is compared against
        name (str): the name of the new column to be created
        relationship (str): the relationship of the input data to the condition
            Should be one of "ge" (greater than or equal to), "gt"
            (greater than), "le" (less than or equal to), "lt" (less than),
            "eq" (equal to), "ne" (not equal to),
            "contains" (if string contains) or "between"
            (between the two numbers, inclusive).
        preq_column (str or None): name of the pre-requisite column if it
            exists, or `None` (or NaN / empty string) if no pre-requisite
        source_name (str): the name of the column to be checked

    """

    def __init__(self, condition_row):
        """Inits MapCondition instance with a row of a valid mapping file
        (or pd.Series or dictionary containing the correct fields).

        Args:
            condition_row (Pandas Series or dict): that contains five key-value
                pairs. Required keys are "New Column Name", "Relationship",
                "Condition", "Prerequisite", and "Source Column ID". This
                should come from a valid mapping file with a Configuration
                object.

        Returns:
            None

        Raises:
            KeyError: if one of the required keys is missing

        """
        # TODO init checks
        self.condition = condition_row["Condition"]
        self.name = condition_row["New Column Name"]
        self.relationship = condition_row["Relationship"]
        self.preq_column = condition_row["Prerequisite"]
        self.source_name = condition_row["Source Column ID"]

    @staticmethod
    def factory(relationship, condition=""):
        """static class factory method, which determines which subclass to
        return

        Args:
            relationship (str): a relationship in (gt, ge, lt, le, ne, eq,
                contains, between) that represents a comparison to be
                made to the raw data
            condition (str or int): the condition being matched. if
                relationship is ambiguous, then this determines if condition
                is numerical or string. Defaults to empty string.

        Returns:
            MapCondition: returns specific subclass that corresponds to the
            correct relationship

        Raises:
            AssertionError: if no subclass is defined for `relationship`

        Examples:
            >>> MapCondition.factory("ge") #doctest: +ELLIPSIS
            <class '...NumMapCondition'>

            >>> MapCondition.factory("eq", 0) #doctest: +ELLIPSIS
            <class '...NumMapCondition'>

            >>> MapCondition.factory("eq") #doctest: +ELLIPSIS
            <class '...StrMapCondition'>

            >>> MapCondition.factory("contains") #doctest: +ELLIPSIS
            <class '...ContainsCondition'>

            >>> MapCondition.factory("between") #doctest: +ELLIPSIS
            <class '...BetweenCondition'>

            >>> MapCondition.factory("eqq") #doctest: +ELLIPSIS
            Traceback (most recent call last):
            AssertionError: No defined Condition class for eqq type

        """
        if relationship in ("gt", "ge", "lt", "le"):
            return NumMapCondition
        if relationship in ("ne", "eq"):
            # (in)equality is ambiguous: only an all-digit condition is
            # treated as numeric, everything else is compared as a string
            if str(condition).isdigit():
                return NumMapCondition
            return StrMapCondition
        if relationship == "contains":
            return ContainsCondition
        if relationship == "between":
            return BetweenCondition
        # Raised explicitly (not via `assert`) so the check survives `python -O`
        # while keeping the exception type existing callers may rely on.
        raise AssertionError("No defined Condition class for "
                             + str(relationship) + " type")

    def check(self, prepared_data):
        """Checks the condition against dataframe. Do not check NAs, just add
        them back afterward.

        Args:
            prepared_data (Pandas DataFrame): a dataframe containing a created
                column with the name specified in self.source_dtype

        Returns:
            Array: returns a boolean array where the condition is met (as
            float), with NaN wherever the evaluated column is null

        Examples:
            >>> test_df = pd.DataFrame({"source_test_str": ["test condition", "test condition 2", np.nan], "source_test_num": [4, 5, np.nan]})
            >>> StrMapCondition({"Condition" : "test condition", "New Column Name" : "test new column name", "Relationship" : "eq", "Prerequisite" : None, "Source Column ID" : "source_test"}).check(test_df)
            array([ 1., 0., nan])

            >>> NumMapCondition({"Condition" : 4.5, "New Column Name" : "test new column name", "Relationship" : "ge", "Prerequisite" : None, "Source Column ID" : "source_test"}).check(test_df)
            array([ 0., 1., nan])
        """
        eval_col = prepared_data[self.source_dtype]
        # null inputs stay null in the output instead of becoming False
        return np.where(eval_col.notnull(),
                        self._run_check(eval_col),
                        np.nan)

    def _run_check(self, eval_col):
        """internal method to check the condition on a given column

        Args:
            eval_col (Pandas Series): a Pandas Series containing data to
                evaluate

        Returns:
            Pandas Series: returns a boolean series where the condition is met

        Examples:
            >>> test = pd.Series(["A","B","C"])
            >>> StrMapCondition({"Condition" : "B", "New Column Name" : "test new column name", "Relationship" : "eq", "Prerequisite" : None, "Source Column ID" : "source_test"})._run_check(test)
            0    False
            1     True
            2    False
            dtype: bool

            >>> test = pd.Series([1, 2, 3])
            >>> NumMapCondition({"Condition" : 3,  "New Column Name" : "test new column name",  "Relationship" : "ge",  "Prerequisite" : None,  "Source Column ID" : "source_test"})._run_check(test)
            0    False
            1    False
            2     True
            dtype: bool
        """
        # the relationship doubles as the name of the Series comparison
        # method (Series.eq, Series.ge, ...)
        compare = getattr(eval_col, self.relationship)
        return compare(self.condition)

    def _has_prereq(self):
        """True unless the pre-requisite is null (None/NaN) or ""."""
        return not (pd.isnull(self.preq_column) or self.preq_column == "")

    def check_prereq(self, transformed_data):
        """checks for pre-req column status; if there is no pre-req, returns
        true, else looks up values of pre-req column from transformed_data

        Args:
            transformed_data (Pandas DataFrame): the new dataframe
                being created, which contains any pre-req columns

        Returns:
            boolean or boolean pd.series: representing whether pre-req is
                satisfied

        Examples:
            >>> test_df = pd.DataFrame({"preq_one": np.repeat(True,5),  "preq_two": np.repeat(False, 5)})

            If there is no pre-req, simply returns True (1) Pandas can
            interpret this in boolean indexing.

            >>> NumMapCondition({"Condition" : 4.5,  "New Column Name" : "test new column name",  "Relationship" : "ge",  "Prerequisite" : None,  "Source Column ID" : "source_test"}  ).check_prereq(test_df)
            1

            If there is a pre-req, then returns the value of transformed_data
            with that column.

            >>> NumMapCondition({"Condition" : 4.5,  "New Column Name" : "test new column name",  "Relationship" : "ge",  "Prerequisite" : "preq_one",  "Source Column ID" : "source_test"}  ).check_prereq(test_df)
            0    True
            1    True
            2    True
            3    True
            4    True
            Name: preq_one, dtype: bool

            >>> NumMapCondition({"Condition" : 4.5,  "New Column Name" : "test new column name",  "Relationship" : "ge",  "Prerequisite" : "preq_two",  "Source Column ID" : "source_test"}  ).check_prereq(test_df)
            0    False
            1    False
            2    False
            3    False
            4    False
            Name: preq_two, dtype: bool
        """
        if not self._has_prereq():
            return 1
        return transformed_data[self.preq_column]

    @abstractmethod
    def possible_values(self):
        """ abstract method stub
        generate a non-exhaustive list possible values implied by condition """
        return

    def prepare_data(self, raw_data):
        """prepares raw_data by ensuring dtypes are correct for each comparison

        Args:
            raw_data (dataframe): a data frame containing raw data, including
                the column given in self.source_name.

        Returns:
            Array: the column in `raw_data` named in self.source_name, with
            the attribute self.prep_func applied to it; positions that were
            null in the raw column are NaN.

        """
        raw_col = raw_data[self.source_name]
        return np.where(raw_col.notnull(),
                        self.prep_func(raw_col),
                        np.nan)

    def describe(self):
        """ just a wrapper for the __str__ function """
        return self.__str__()

    def __repr__(self):
        """console representation for class

        Examples:
            >>> NumMapCondition({"Condition" : 4,  "New Column Name" : "test new name",  "Relationship" : "ge",  "Prerequisite" : "preq_two",  "Source Column ID" : "source_test"})
            <NumMapCondition: test new name = [column source_test].ge(4.0)>
        """
        return "<{}:\t {} = [column {}].{}({})>".format(
            self.__class__.__name__, self.name, self.source_name,
            self.relationship, self.condition)

    def __str__(self):
        """str representation

        The pre-requisite clause is only appended when a pre-requisite is
        actually set (i.e. it is not None, NaN or the empty string).

        Examples:
            >>> print(NumMapCondition({"Condition" : 4,  "New Column Name" : "test new column name",  "Relationship" : "ge",  "Prerequisite" : "preq_two",  "Source Column ID" : "source_test"}))
            NumMapCondition: New column test new column name is true where   input data column source_test is greater than or equal to 4.0 and   the new column preq_two is true.

        """
        report_as = " ".join([self.__class__.__name__ + ":",
                              "New column", self.name,
                              "is true where input data column",
                              self.source_name,
                              english_relationship(self.relationship),
                              str(self.condition)])
        if self._has_prereq():
            return (report_as + " and the new column " +
                    str(self.preq_column) + " is true.")
        return report_as


class StrMapCondition(MapCondition):
    """class representing a str condition, inherits from MapCondition

    Attributes:
        source_dtype (str): instance attribute, a copy of the instance
            attribute self.source_name with "_str" appended,
            to represent the expected dtype
        prep_func (function): instance attribute, a function to apply before
            making a string-based comparison; it casts the column to str
            (null values are restored afterwards by `prepare_data`).
    """

    def __init__(self, condition_row):
        """Inits StrMapCondition

        Args:
            condition_row (Pandas Series or dict): see MapCondition's __init__

        Examples:
            >>> StrMapCondition({"Condition" : "test cond",  "New Column Name" : "test new name",  "Relationship" : "eq",  "Prerequisite" : None,  "Source Column ID" : "source_test"}  )
            <StrMapCondition: test new name = [column source_test].eq(test cond)>
        """
        super().__init__(condition_row)
        self.source_dtype = self.source_name + "_str"
        self.prep_func = lambda x: x.astype(str)

    def possible_values(self):
        """generate a non-exhaustive list possible values implied by condition

        Args:   None

        Returns:
            list: list containing 8 possible values that might be expected by
                this condition: empty string, NA, None, the self.condition
                attribute, and the literals "yes", "no", "dk" and "ref"

        Examples:
            >>> StrMapCondition({"Condition" : "test condition",  "New Column Name" : "test new column name",  "Relationship" : "eq",  "Prerequisite" : None,  "Source Column ID" : "source_test"}  ).possible_values()
            ['', nan, None, 'test condition', 'yes', 'no', 'dk', 'ref']

        """
        return ["", np.nan, None, self.condition, "yes", "no", "dk", "ref"]


class NumMapCondition(MapCondition):
    """class representing a numerical condition, inherits from MapCondition

    Attributes:
        source_dtype (str): a copy of the instance attribute self.source_name
            with "_num" appended, to represent the expected dtype
        prep_func (function): instance attribute, a function to apply before
            making a numerical-based comparison. pd.to_numeric() coerces
            non-number data to NaN.
    """

    def __init__(self, condition_row, cast_cond=True):
        """Inits NumMapCondition

        Args:
            condition_row (Pandas Series or dict): see MapCondition's __init__
            cast_cond (bool): Whether to force self.condition attribute to be a
                float. Defaults to True.

        Raises:
            ValueError or TypeError: if `cast_cond` is true and the condition
                cannot be converted with float()

        Examples:
            >>> NumMapCondition({"Condition" : 3,  "New Column Name" : "test new name",  "Relationship" : "ge",  "Prerequisite" : None,  "Source Column ID" : "source_test"})
            <NumMapCondition: test new name = [column source_test].ge(3.0)>

        """
        super().__init__(condition_row)
        if cast_cond:
            self.condition = float(condition_row["Condition"])
        self.source_dtype = self.source_name + "_num"
        self.prep_func = lambda x: pd.to_numeric(x, errors="coerce")

    def possible_values(self):
        """generate a non-exhaustive list of possible values implied by
        condition

        Args:   None

        Returns:
            list: list containing range of possible values. If a greater than
                relationship, the list will include values from
                self.condition + 1 up to (not including) self.condition*2.
                Otherwise it will include values from 0 up to (not including)
                self.condition. If the relationship involves equality
                ("ge", "le", "eq", "ne"), then self.condition itself is
                appended as well.

        Examples:
            >>> NumMapCondition({"Condition" : 3,  "New Column Name" : "test new name",  "Relationship" : "ge",  "Prerequisite" : None,  "Source Column ID" : "source_test"}).possible_values()
            [4.0, 5.0, 3.0]
            >>> NumMapCondition({"Condition" : 3,  "New Column Name" : "test new name",  "Relationship" : "lt",  "Prerequisite" : None,  "Source Column ID" : "source_test"}).possible_values()
            [0.0, 1.0, 2.0]
            >>> NumMapCondition({"Condition" : 3,  "New Column Name" : "test new name",  "Relationship" : "eq",  "Prerequisite" : None,  "Source Column ID" : "source_test"}).possible_values()
            [0.0, 1.0, 2.0, 3.0]

        """
        if self.relationship.startswith("g"):
            possible = np.arange(self.condition + 1, self.condition * 2)
        else:
            possible = np.arange(0, self.condition)
        # Every equality-flavoured code (ge, le, eq, ne) contains an "e";
        # gt and lt do not. Testing only the second letter used to miss "eq".
        if "e" in self.relationship:
            possible = np.append(possible, self.condition)
        return possible.tolist()


class ContainsCondition(StrMapCondition):
    """ Subclass of StrMapCondition that overrides ._run_check() method for the
    `contains` relationship
    """

    def _run_check(self, eval_col):
        """overrides _run_check condition of abstract MapCondition.
        checks condition against input data to see if input data contains
        the substring in the self.condition attribute.

        The condition is matched as a literal substring, not as a regular
        expression, so characters such as "(", "." or "|" carry no special
        meaning.

        Args:
            eval_col (Pandas Series): the prepared column to evaluate (the
                column named in self.source_dtype when called via .check())

        Returns:
            boolean Pandas Series: a Pandas Series that is true where
                `eval_col` contains the substring in self.condition; null
                entries evaluate to False

        Examples:
            >>> test_df = pd.DataFrame({"source_test_1_str": ["test condition",  "test condition 2"],  "source_test_2_str": ["test test",  "test condition"]})
            >>> ContainsCondition({"Condition" : "test condition",  "New Column Name" : "test new column name",  "Relationship" : "contains",  "Prerequisite" : None,  "Source Column ID" : "source_test_1"}).check(test_df)
            array([1., 1.])

            >>> ContainsCondition({"Condition" : "test condition",  "New Column Name" : "test new column name",  "Relationship" : "contains",  "Prerequisite" : None,  "Source Column ID" : "source_test_2"}).check(test_df)
            array([0., 1.])

            >>> ContainsCondition({"Condition" : "a (b)",  "New Column Name" : "test new column name",  "Relationship" : "contains",  "Prerequisite" : None,  "Source Column ID" : "source_test"})._run_check(pd.Series(["x a (b) y", "a b"])).tolist()
            [True, False]
        """
        # str() guards against a non-string condition (e.g. a number read
        # from the mapping file); regex=False makes this a plain substring
        # test as documented.
        needle = str(self.condition)
        return eval_col.fillna("").str.contains(needle, regex=False)


class BetweenCondition(NumMapCondition):
    """Subclass of NumMapCondition that overrides __init__ and ._run_check()
    methods for the `between` relationship

    Attributes:
        low (float): a float representing the lowest acceptable value (incl)
        high (float): a float representing the highest acceptable value (incl)
    """

    def __init__(self, condition_row):
        """Inits between condition. Same as NumMapCondition init, but do not
        cast condition_row["Condition"] to a float, instead split it at "to"
        and convert the first and second elements to floats. Whitespace
        around the numbers and the case of "to" are ignored.

        Args:
            condition_row (Pandas Series or dict): see MapCondition's __init__

        Raises:
            ValueError: if the condition is not of the form "<low> to <high>"
                with two numeric bounds

        Examples:
            >>> BetweenCondition({"Condition" : "3 to 5",  "New Column Name" : "test new column name",  "Relationship" : "between",  "Prerequisite" : None,  "Source Column ID" : "source_test_1"})
            <BetweenCondition: test new column name = [column
            source_test_1].between(3 to 5)>
        """
        super().__init__(condition_row, False)
        # float() tolerates surrounding whitespace, so splitting on the bare
        # word is enough to accept "3 to 5", "3to5" or " 3  TO 5 " alike.
        bounds = str(self.condition).lower().split("to")
        if len(bounds) != 2:
            raise ValueError(
                "Between condition must look like '<low> to <high>', got "
                + repr(self.condition))
        self.low, self.high = [float(bound) for bound in bounds]

    def _run_check(self, eval_col):
        """checks condition against input data to see if condition is true.

        Args:
            eval_col (Pandas Series): the prepared column to evaluate (the
                column named in self.source_dtype when called via .check())

        Returns:
            boolean Pandas Series: true where `eval_col` has a number between
                self.low and self.high, inclusive

        Examples:
            >>> test_df = pd.DataFrame({"source_test_1_num": [1,2,3],   "source_test_2_num": [4,5,6]})
            >>> BetweenCondition({"Condition" : "3 to 5",  "New Column Name" : "test new column name",  "Relationship" : "between",  "Prerequisite" : None,  "Source Column ID" : "source_test_1"}).check(test_df)
            array([0., 0., 1.])

            >>> BetweenCondition({"Condition" : "3 to 5",  "New Column Name" : "test new column name",  "Relationship" : "between",  "Prerequisite" : None,  "Source Column ID" : "source_test_2"}).check(test_df)
            array([1., 1., 0.])

        """
        return eval_col.between(self.low, self.high)

    def possible_values(self):
        """generate a non-exhaustive list of possible values implied by the
        condition

        Args:   None

        Returns:
            list: a list of values from self.low - 1 up to and including
                self.high + 1, in steps of 1

        Examples:
            >>> BetweenCondition({"Condition" : "3 to 5",  "New Column Name" : "test new column name",  "Relationship" : "between",  "Prerequisite" : None,  "Source Column ID" : "source_test_2"}  ).possible_values()
            [2.0, 3.0, 4.0, 5.0, 6.0]

        """
        # arange excludes its stop value, hence high + 2 to reach high + 1
        return np.arange(self.low - 1, self.high + 2).tolist()


if __name__ == "__main__":
    # Run every docstring example in this module as a test when the file is
    # executed directly; whitespace differences in expected output are ignored.
    from doctest import NORMALIZE_WHITESPACE, testmod
    testmod(optionflags=NORMALIZE_WHITESPACE)