Source code for configuration

"""
Structure for Configuration class
"""

# !/usr/bin/env python3
#  -*- coding: utf-8 -*-
import pandas as pd
import numpy as np

from pycrossva.mappings import MapCondition
from pycrossva.utils import report_list
from pycrossva.validation import Validation, Err, Warn


[docs]class Configuration():
    """ Configuration class details the relationship between a set of input
    data and output data. It is composed of MapConditions that
    transform an input data source (2012 WHO, 2016 WHO 141, 2016 WHO 151,
    PHRMC SHORT) into a different data form (PHRMC SHORT, InSilicoVA,
    InterVA4, InterVA5, or Tarrif2) for verbal autopsy.

    Attributes:
        given_columns (Pandas Series): columns of mapping dataframe.
        required_columns (Pandas Series): required columns in mapping data.
        main_columns (list): the four main columns required in config_data.
        valid_relationships (Pandas Series): contains list of valid
            relationships to use in comparisons. Relationships should be an
            attr of Pandas Series object, or be defined as a subclass of
            MapCondition.
        config_data (Pandas DataFrame): dataframe containing mapping
            relationships written out.
        given_prereq (Pandas Series): lists pre-requisites referenced in config
            data.
        new_columns (Pandas Series): lists the new columns to be created with
            config data.
        source_columns (Pandas Series): lists the source columns required in
            the raw input data.
        verbose (int): controls default verbosity of printing to console.
        process_strings (boolean): whether or not to remove whitespace and
            non-alphanumeric characters from strings in condition field and in
            raw_data during mapping.
        validation (Validation): a validation object containing the validation
            checks made

    """
    required_columns = pd.Series(['New Column Name',
                                  'New Column Documentation',
                                  'Source Column ID',
                                  'Source Column Documentation',
                                  'Relationship',
                                  'Condition', 'Prerequisite',
                                  ],
                                 name="expected columns")

    main_columns = ["New Column Name", "Source Column ID",
                    "Relationship", "Condition"]

    valid_relationships = pd.Series(["gt", "ge", "lt", "le", "between", "eq",
                                     "ne", "contains"],
                                    name="valid relationships")

    def __init__(self, config_data, verbose=1, process_strings=True):
        """Inits configuration class from a Pandas DataFrame containing
            mapping data.

        Args:
            config_data (Pandas DataFrame): dataframe representing the mapping
                data relationships, in specified format.
            verbose (int): should be in range 1-5,
                controls print output, where 0=silent. Defaults to 1.
            process_strings (boolean): Whether or not to normalize string
                conditions and data. Defaults to True.

        Returns:
            Nothing

        Examples:
            >>> MAP_PATH = "resources/mapping_configuration_files/"
            >>> EX_MAP_1 = pd.read_csv(MAP_PATH + "example_config_1.csv")
            >>> Configuration(EX_MAP_1)
            Unvalidated Configuration instance with 16 mapping conditions

        """
        # TODO check for df & # of rows & these 3 columns
        # throw cant instantiate error?

        self.config_data = config_data
        self.given_columns = pd.Series(config_data.columns,
                                       name="the given columns")
        self.process_strings = process_strings

        def plain_info(aseries, new_name):
            """ returns Pandas Series without NAs or duplicates renamed to
            `new_name` """
            return aseries.drop_duplicates().dropna().rename(new_name)

        self.given_relationships = plain_info(config_data["Relationship"],
                                              "Relationship column")
        self.given_prereq = plain_info(config_data["Prerequisite"],
                                       "Prerequisite column")
        self.new_columns = plain_info(config_data["New Column Name"],
                                      "New Column Name column")
        self.source_columns = plain_info(config_data["Source Column ID"],
                                         "Source Column ID column")
        self.verbose = verbose
        self.validation = Validation("Mapping Configuration")

    def __str__(self):
        """string representation of class"""
        return self.__repr__()

    def __repr__(self):
        """string representation of class"""
        val_status = ("Validated" if self.validation.is_valid() else
                      "Unvalidated")
        return " ".join([val_status, self.__class__.__name__, "instance with",
                         str(len(self.config_data)), "mapping conditions"])

[docs]    def list_conditions(self):
        """Lists the final mapping conditions contained in Configuration object

        Returns:
            list: list of MapConditions, where each MapConditions is
            created from a row of processed mapping data.

        Examples:
            >>> MAP_PATH = "resources/mapping_configuration_files/"
            >>> EX_MAP_1 = pd.read_csv(MAP_PATH + "example_config_1.csv")
            >>> c = Configuration(EX_MAP_1)
            >>> c.list_conditions()[:5]
            [<StrMapCondition:     AB_POSIT = [column Id10403].eq(yes)>,
             <StrMapCondition:     AB_SIZE = [column Id10362].eq(yes)>,
             <NumMapCondition:     AC_BRL = [column Id10169].lt(14.0)>,
             <NumMapCondition:     AC_CONV = [column Id10221].lt(10.0)>,
             <NumMapCondition:     AC_COUGH = [column Id10154].lt(21.0)>]

        """
        self.config_data["Standalone"] = self.config_data["Prerequisite"].isnull()
        # Sort first so that columns w/o prereqs are processed first
        return [MapCondition.factory(row["Relationship"], row["Condition"])(row)
                for i, row in self.config_data.sort_values("Standalone",
                                                           ascending=False).iterrows()]

[docs]    def validate(self, verbose=None):
        """Prepares and validates the Configuration object's mapping conditions.
        Validation fails if there are any inoperable errors. Problems that can
        be fixed in place are processed and flagged as warnings.

        Args:
            verbose (int): controls print output, should be in range 0-5,
                each higher level includes the messages of each level below it.
                Where verbose = 0, nothing will be printed to console.
                Where verbose = 1, print only errors to console,
                where verbose = 2, also print warnings,
                where verbose = 3, also print suggestions and status checks,
                where verbose = 4, also print passing validation checks,
                where verbose = 5, also print description of configuration
                conditions.
                Defaults to None; if none, replace with self.verbose attribute
        Returns:
            Boolean: boolean representing whether there are any errors that
                prevent validation

        Examples:
            >>> MAP_PATH = "resources/mapping_configuration_files/"
            >>> EX_MAP_2 = pd.read_csv(MAP_PATH + "example_config_2.csv")
            >>> c = Configuration(EX_MAP_2)
            >>> c.validate(verbose=4)
            Validating Mapping Configuration . . .
            <BLANKLINE>
             CHECKS PASSED
            [X]          All expected columns ('New Column Name', 'New Column Documentation', 'Source Column ID', 'Source Column Documentation', 'Relationship', 'Condition', and 'Prerequisite') accounted for in configuration file.
            [X]          No leading/trailing spaces column New Column Name detected.
            [X]          No leading/trailing spaces column Relationship detected.
            [X]          No leading/trailing spaces column Prerequisite detected.
            [X]          No leading/trailing spaces column Condition detected.
            [X]          No whitespace in column Condition detected.
            [X]          No upper case value(s) in column Relationship detected.
            [X]          No upper case value(s) in column Condition detected.
            [X]          No non-alphanumeric value(s) in column Source Column ID detected.
            [X]          No non-alphanumeric value(s) in column Relationship detected.
            [X]          No non-alphanumeric value(s) in column Condition detected.
            [X]          No new column(s) listed but not defined in Mapping Configuration detected.
            [X]          No NA's in column New Column Name detected.
            [X]          No NA's in column Source Column ID detected.
            <BLANKLINE>
             ERRORS
            [!]          3 values in Relationship column were invalid ('eqqqq', 'another fake', and 'gee'). These must be a valid method of pd.Series, e.g. ('gt', 'ge', 'lt', 'le', 'between', 'eq', 'ne', and 'contains') to be valid.
            [!]          2 row(s) containing a numerical relationship with non-number condition detected in row(s) #8, and #9.
            [!]          2 values in Prerequisite column were invalid ('ABDOMM', and 'Placeholder here'). These must be defined in the 'new column name' column of the config file to be valid.
            <BLANKLINE>
             WARNINGS
            [?]          2 whitespace in column New Column Name detected in row(s) #6, and #8. Whitespace will be converted to '_'
            [?]          1 whitespace in column Relationship detected in row(s) #4. Whitespace will be converted to '_'
            [?]          1 whitespace in column Prerequisite detected in row(s) #9. Whitespace will be converted to '_'
            [?]          1 non-alphanumeric value(s) in column New Column Name detected in row(s) #6. This text should be alphanumeric. Non-alphanumeric characters will be removed.
            [?]          2 duplicate row(s) detected in row(s) #1, and #14. Duplicates will be dropped.
            [?]          1 NA's in column Relationship detected in row(s) #3.
            [?]          1 NA's in column Condition detected in row(s) #6.
            False
        """
        if verbose is None:
            verbose = self.verbose

        # Check that all expected columns accounted for
        col_passing_msg = " ".join(["All expected columns",
                                    report_list(self.required_columns),
                                    "accounted for in configuration file."])
        self.validation.must_contain(self.given_columns,
                                     self.required_columns,
                                     passing_msg=col_passing_msg)

        # reindex - any missing columns become filled with NA
        self.config_data = self.config_data.reindex(
            columns=self.required_columns)

        # Drop any rows that are entirely blank without warnings
        self.config_data = self.config_data.dropna(how="all")

        # Processing strings
        # columns that should contain no whitespace
        ws_col = ["New Column Name", "Relationship", "Prerequisite"]
        lowercase_col = ["Relationship"]  # columns that should be lowercase
        if self.process_strings:
            ws_col.append("Condition")
            lowercase_col.append("Condition")
        self.config_data.fillna("na", inplace=True)  # fill NAs for str ops

        # Remove whitespace
        self.config_data.loc[:, ws_col] = self.validation.fix_whitespace(
            self.config_data.loc[:, ws_col])

        # Check for uppercase characters
        self.config_data.loc[:, lowercase_col] = self.validation.fix_upcase(
            self.config_data.loc[:, lowercase_col])

        # Check that main columns contain only alphanumeric values
        self.config_data.loc[:, self.main_columns] = self.validation.fix_alnum(
            self.config_data.loc[:, self.main_columns])

        # Check for missing values
        self.config_data = self.config_data.replace("na", np.nan)

        # Check for duplicate rows & drop them
        self.validation.flag_rows(self.config_data.duplicated(),
                                  flag_criteria="duplicate row(s)",
                                  flag_action="Duplicates will be dropped.")
        self.config_data = self.config_data.drop_duplicates()

        # Check and note if there are missing sources/conditions/rel
        # ie if we expect any of these sources to be absent
        defined_no_source = (np.all(self.config_data[["Source Column ID",
                                                      "Relationship", "Condition"]].isnull(), axis=1)
                             & self.config_data["New Column Name"].notnull())

        self.validation.flag_elements(
            defined_no_source,
            self.config_data["New Column Name"],
            criteria="new column(s) listed but not defined")

        self.config_data = self.config_data.loc[~defined_no_source, :]

        # Check & drop rows that contain any NAs in main columns
        self.validation.check_na(self.config_data[self.main_columns])
        self.config_data = self.config_data.loc[np.all(
            self.config_data[self.main_columns].notnull(), axis=1), :]

        # check all relationships in relationship column are valid
        self.validation.all_valid(self.given_relationships,
                                  self.valid_relationships,
                                  "a valid method of pd.Series, e.g. " +
                                  report_list(self.valid_relationships))

        # check for non-number conditions with numerical relationships
        invalid_num = (self.config_data["Relationship"].isin(
            ["gt", "ge", "le", "lt"]) &
            (pd.to_numeric(self.config_data["Condition"],
                           errors="coerce").isnull()))
        self.validation.flag_rows(invalid_num,
                                  flag_criteria="row(s) containing a numerical"
                                  + " relationship with non-number condition",
                                  flag_tier=Err)

        # check all prerequisite columns are also defined in configuration
        self.validation.all_valid(self.given_prereq, self.new_columns,
                                  "defined in the 'new column name' column " +
                                  "of the config file")

        self.validation.report(verbose=verbose)  # report
        if verbose == 5:
            self.describe()

        # return true only if there are zero errors
        return self.validation.is_valid()

[docs]    def describe(self):
        """Prints the mapping relationships in the Configuration object to
        console.

        Args:
            None

        Returns:
            None

        Examples:
            >>> MAP_PATH = "resources/mapping_configuration_files/"
            >>> EX_MAP_1 = pd.read_csv(MAP_PATH + "example_config_1.csv")
            >>> Configuration(EX_MAP_1).describe()
            MAPPING STATS
            <BLANKLINE>
             -   16 new columns produced ('AB_POSIT', 'AB_SIZE', 'AC_BRL', 'AC_CONV', 'AC_COUGH', etc)
             -   12 source columns required ('Id10403', 'Id10362', 'Id10169', 'Id10221', 'Id10154', etc)
             -   7 relationships invoked ('eq', 'lt', 'between', 'ge', 'contains', etc)
             -   13 conditions listed ('yes', '14', '10', '21', '15 to 49', etc)
             -   1 prerequisites checked ('FEMALE')

        """
        print("MAPPING STATS\n")
        spacer = " - \t"

        unique_checks = [("New Column Name", "new columns produced"),
                         ("Source Column ID", "source columns required"),
                         ("Relationship", "relationships invoked"),
                         ("Condition", "conditions listed"),
                         ("Prerequisite", "prerequisites checked")]

        for col_name, context in unique_checks:
            print(spacer, self.config_data[col_name].nunique(), context,
                  report_list(self.config_data[col_name].dropna().unique(),
                              limit=5))


[docs]class CrossVA():
    """Class representing raw VA data, and how to map it to an algorithm

    Attributes:
        mapping (type): a validated Configuration object that details how to
            transform the type of data in `raw_data` to the desired output.
        data (Pandas DataFrame): a Pandas DataFrame containing the raw VA data
        prepared_data (Pandas DataFrame): a Pandas DataFrame containing a
            prepared form of the VA data to use with the Configuration object.
        validation (Validation): Validation object containing the validation
            checks that have been made on the raw data and between the raw data
            and mapping Configuration.
        verbose (int): Controls verbosity of printing to console, 0-5 where 0
            is silent.
    """

    def __init__(self, raw_data, mapping_config, na_values=["dk", "ref", ""],
                 verbose=2):
        """Inits CrossVA class

        Args:
            raw_data (Pandas DataFrame): a Pandas DataFrame containing the
                raw data
            mapping_config (Configuration): a validated Configuration object
                that details how to transform the type of data in `raw_data`
                to the desired output.
            na_values (list): List of values to consider NA.
            verbose (int): Controls verbosity of printing to console. Defaults
                to 2.
        Returns:
            type: Description of returned object.

        Examples
            >>> MAP_PATH = "resources/mapping_configuration_files/"
            >>> EX_MAP_1 = pd.read_csv(MAP_PATH + "example_config_1.csv")
            >>> EX_DATA_1 = pd.read_csv("resources/sample_data/mock_data_2016WHO151.csv")
            >>> CrossVA(EX_DATA_1, Configuration(EX_MAP_1))
            <CrossVA with (4, 12) raw data and Validated Configuration instance
            with 16 mapping conditions>

        """
        # if mapping_config isn't valid, attempt to validate before aborting
        if not mapping_config.validation.is_valid():
            if not mapping_config.validate():
                pass
                # raise error - mapping_config should be a valid

        # Crop input data to only the columns which have a matching ID
        # at the end of their name using regex, and any additional columns
        # passed as an argument, and then rename dataframe to just the column
        # IDs expected from the mapping
        self.mapping = mapping_config
        new_columns = raw_data.columns.str.extract(
            "(" + "$|".join(self.mapping.source_columns.tolist()) + "$)",
            expand=False)
        cropped_data = raw_data.loc[:, new_columns.notnull()].copy()
        cropped_data = cropped_data.replace(na_values, np.nan)
        self.data = cropped_data.rename(columns=pd.Series(new_columns,
                                                          raw_data.columns))
        self.prepared_data = pd.DataFrame()
        self.verbose = verbose
        self.validation = Validation("Mapping-Data Relationship")

    def __str__(self):
        """ str representation method """
        return ""

    def __repr__(self):
        return ("<" + self.__class__.__name__ + " with " + str(self.data.shape)
                + " raw data and " + str(self.mapping) + ">")

[docs]    def process(self):
        """Applies the given configuration object's mappings to the given raw
        data.

        Args:   None

        Returns:
            Pandas DataFrame: a dataframe where the transformations specified
            have been applied to the raw data, resulting

        """
        if not self.validation.is_valid():
            if not self.validate():
                raise ValueError(("Can't process without valid"
                                  " CrossVA instance"))

        # Create empty dataframe with the list of columns given in mapping
        # If the new columns listed in the mapping have no definition (ie source,
        # relationship, and condition) then they will keep their default value
        # as NA.
        transformed_data = pd.DataFrame(index=np.arange(len(self.data)),
                                        columns=self.mapping.new_columns,
                                        dtype=float)
        transformed_data.columns.name = ""

        for condition in self.mapping.list_conditions():
            # this is the big transformation
            # create column_values (boolean pd.series) representing if
            # condition and preq is met in raw data for each row, preserving NA
            pre_req = condition.check_prereq(transformed_data)
            initial_condition = condition.check(self.prepared_data)

            # using 0 for false and 1 for true
            new_val = np.where((initial_condition == 0) | (pre_req == 0),
                               0,
                               pre_req * initial_condition)

            # update new column where new column is currently NA and new values
            # are True or False. If new column is False, overwrite if new value
            # is true. If new column is already True, do not overwrite.
            # This preserves NAs logically, creates implicit ANY relationship
            # between different conditions for the same new column
            transformed_data[condition.name] = np.sign(
                transformed_data[condition.name].add(new_val,
                                                     fill_value=0))

        return transformed_data

[docs]    def validate(self, verbose=None):
        """Validates that RawVAData's raw input data and its mapping
        configuration object are compatible and prepares input data for use.

        Args:
            verbose (int): int from 0 to 5, representing verbosity of printing
                to console. Defaults to None; if None, replaced with
                self.verbose attribute.

        Returns:
            boolean: True if valid, False if not.

        Examples:
            >>> MAP_PATH = "resources/mapping_configuration_files/"
            >>> EX_MAP_1 = pd.read_csv(MAP_PATH + "example_config_1.csv")
            >>> EX_DATA_1 = pd.read_csv("resources/sample_data/mock_data_2016WHO151.csv")
            >>> CrossVA(EX_DATA_1, Configuration(EX_MAP_1)).validate(verbose=0)
            True

        """
        if verbose is None:
            verbose = self.verbose

        if self.mapping.process_strings:
            # strip whitespace and replace non-trailing/leading with underscore
            # for str operation convenience
            self.data.fillna("NA", inplace=True)
            self.data = self.validation.fix_whitespace(self.data)
            # make all characters lowercase
            self.data = self.validation.fix_upcase(self.data)
            # strip for alphanumeric characters
            self.data = self.validation.fix_alnum(self.data)
            self.data = self.data.replace("NA", np.nan)

        # check all expected columns from config Source Column ID are present
        col_msg = "All expected columns from mapping file are present in data"
        self.validation.must_contain(self.data.columns.rename(
            "the input data columns"),
            self.mapping.source_columns.rename(
            "expected source column IDs listed in mapping file"),
            passing_msg=col_msg,
            fail=Warn)

        # warn about associated missing values
        flagged_missing = ~(self.mapping.config_data["Source Column ID"].isin(
            self.data.columns))

        if flagged_missing.sum() > 0:
            affected = self.mapping.config_data[flagged_missing].groupby(
                "Source Column ID")["New Column Name"].apply(list)
            self.validation.affected_by_absence(affected)

        self.validation.no_duplicates(self.data.columns)

        # TODO Check to see if relationship and conditions correspond to
        # logical values in each column
        if self.validation.is_valid():
            # add missing columns as NA
            self.data = self.data.reindex(columns=self.mapping.source_columns)
            # for mapping_condition in self.mapping.list_conditions():
            #     self.prepared_data[mapping_condition.source_dtype] = \
            #         mapping_condition.prepare_data(self.data)
            # get rid of: "PerformanceWarning: DataFrame is highly fragmented."
            dict_of_cols = {}
            for mapping_condition in self.mapping.list_conditions():
                dict_of_cols[mapping_condition.source_dtype] = \
                    mapping_condition.prepare_data(self.data)
            self.prepared_data = pd.concat([self.prepared_data,
                                            pd.DataFrame(
                                                dict_of_cols)],
                                           axis=1)

        self.validation.report(verbose)
        return self.validation.is_valid()


if __name__ == "__main__":
    import doctest
    doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)