# Source code for transform

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Defines main CrossVA function, `transform` which maps raw VA data into data
for use with a VA algorithm in OpenVA.
"""
import sys
import os

import pandas as pd
import numpy as np

from pycrossva.configuration import Configuration, CrossVA
from pycrossva.utils import flexible_read

# Questionnaire formats and VA algorithms for which a packaged mapping
# configuration may be requested with an (input, output) tuple.
SUPPORTED_INPUTS = ["2016WHOv151", "2016WHOv141", "2012WHO",
                    "2021WHO", "PHRMCShort"]
SUPPORTED_OUTPUTS = ["InterVA5", "InterVA4", "InSilicoVA"]


def _read_builtin_mapping(mapping):
    """Loads the packaged mapping configuration named by an (input, output)
    tuple.

    Args:
        mapping (tuple): tuple in form (input, output), where input is one of
            `SUPPORTED_INPUTS` and output is one of `SUPPORTED_OUTPUTS`.

    Returns:
        tuple: `(mapping_data, preserve_na)`, where `mapping_data` is the
        Pandas DataFrame read from the packaged csv and `preserve_na` is
        True only when the requested output is "InSilicoVA".

    Raises:
        ValueError: if `mapping` is not of length two, if the input or the
            output is not supported, or if no csv exists for the pair.
    """
    if len(mapping) != 2:
        raise ValueError(("If mapping is tuple, input should be of length "
                          "two in the form (input type, output type)"))

    source, target = mapping
    if source not in SUPPORTED_INPUTS:
        raise ValueError(("Input not supported. Expected one of "
                          f"{SUPPORTED_INPUTS}, but received "
                          f"'{source}'"))
    if target not in SUPPORTED_OUTPUTS:
        raise ValueError(("Output not supported. Expected one of "
                          f"{SUPPORTED_OUTPUTS}, but received "
                          f"'{target}'"))

    # the algorithm decides NA handling, overriding what the caller passed
    preserve_na = target == "InSilicoVA"
    if target == "InterVA4":
        # InterVA4 is treated as InSilicoVA without NAs, so it reuses the
        # InSilicoVA mapping file
        target = "InSilicoVA"

    internal_path = os.path.join(os.path.split(__file__)[0],
                                 "resources/mapping_configuration_files/")
    expected_filename = f"{internal_path}{source}_to_{target}.csv"
    if not os.path.isfile(expected_filename):
        raise ValueError((f"No mapping supporting {source} "
                          f"to {target} currently exists."))
    return pd.read_csv(expected_filename), preserve_na


def transform(mapping, raw_data, raw_data_id=None, verbose=2, preserve_na=True,
              result_values={"Present": "y", "Absent": "n", "NA": "."}):
    """transforms raw VA data (`raw_data`) into data suitable for use with a VA
    algorithm, according to the specified transformations given in `mapping`.

    Args:
        mapping (string, tuple or Pandas DataFrame): Should be either a tuple
            in form (input, output), a path to csv containing a configuration
            data file, or a Pandas DataFrame containing configuration data
        raw_data (string or Pandas DataFrame): raw verbal autopsy data to
            process
        raw_data_id (string): column name with record ID. If not given, an
            "ID" column numbering the records from 1 is created instead.
        verbose (int): integer from 0 to 5, controlling how much status detail
            is printed to console. Silent if 0. Defaults to 2, which will print
            only errors and warnings.
        preserve_na (bool): whether to preserve NAs in data, or to count them
            as FALSE. When mapping is given as a tuple, this argument is
            overridden: True for InSilicoVA, False for InterVA4 and InterVA5.
            Defaults to TRUE, which allows NA values to perpetuate through
            the data.
        result_values (dict): available as a simple customization option if
            user would like values indicating presence, absence, and NAs to
            be mapped to certain values. Must contain the keys "Present",
            "Absent" and "NA". The default dict is never mutated.

    Returns:
        Pandas DataFrame: the raw data transformed according to specifications
        given in mapping data. Default values are y where symptom is present,
        n where symptom is absent, and . where the value is NA. If NAs are
        not preserved, any values that are still NaN after `result_values`
        have been applied are considered to be false / absent / 0.
        Returns None (after the validation messages have been printed) if
        `raw_data` is not valid for the mapping configuration.

    Raises:
        ValueError: if `mapping` is a tuple that is not of length two or that
            names an unsupported input or output, if no mapping data could be
            read, if the mapping configuration is not valid, or if
            `raw_data_id` is not a column in `raw_data`.

    Examples:
        You can specify the mapping as ('input', 'output') and the path to csv
        as a string:

        >>> transform(("2016WHOv151", "InterVA4"), "resources/sample_data/2016WHO_mock_data_1.csv").loc[range(5),["ACUTE","CHRONIC","TUBER"]]
           ACUTE  CHRONIC  TUBER
        0      y        n      .
        1      y        n      .
        2      n        y      .
        3      n        y      .
        4      y        n      .

        You can also give the data and mapping as Pandas DataFrames:

        >>> my_special_data = pd.read_csv("resources/sample_data/2016WHO_mock_data_1.csv")
        >>> my_special_mapping = pd.read_csv("resources/mapping_configuration_files/2016WHOv151_to_InSilicoVA.csv")
        >>> transform(my_special_mapping, my_special_data).loc[range(5),["ACUTE","CHRONIC","TUBER"]]
           ACUTE  CHRONIC  TUBER
        0      y        n      .
        1      y        n      .
        2      n        y      .
        3      n        y      .
        4      y        n      .

        Note that by default, `preserve_na` is `True` and NA values will be
        left in. If `preserve_na` is `False`, or if the algorithm does not
        preserve NAs, then any values that are still NaN once `result_values`
        have been applied will be filled in as 0's. With the default
        `result_values`, NAs have already been replaced by "." at that point,
        as in the first InterVA4 example above.

        The user can also pass in a different mapping dictionary for
        result_values to change the values from their defaults of
        "n" (False / Absent), "y" (True / Present), and "."
        (No data / missing), if they need their results in a different format.

        >>> transform(("2016WHOv151", "InterVA4"), "resources/sample_data/2016WHO_mock_data_1.csv", result_values={"Absent":"A","Present":"P","NA":"Missing"}).loc[range(5),["ACUTE","CHRONIC","TUBER"]]
          ACUTE CHRONIC    TUBER
        0     P       A  Missing
        1     P       A  Missing
        2     A       P  Missing
        3     A       P  Missing
        4     P       A  Missing

        The mapping-data relationship is designed to be as flexible as possible,
        while still emphasizing traceability and alerting the user to data
        integrity issues.

        Not every source column in the mapping needs to be
        represented in the data. If source columns are missing in the source data,
        then those columns will be created and filled with NA values.

        >>> transform(("2016WHOv151", "InSilicoVA"), "resources/sample_data/2016WHO_mock_data_2.csv").loc[range(5),["ACUTE","FEMALE","MARRIED"]]
        Validating Mapping-Data Relationship . . .
        <BLANKLINE>
         WARNINGS
        [?]          3 (1.3%) expected source column IDs listed in mapping file ('-ageInDaysNeonate', '-Id10019', and '-Id10059') were not found in the input data columns. Their values will be NA.
        [?]          '-Id10019' is missing, which affects the creation of  column(s) 'FEMALE', and 'MALE'
        [?]          '-Id10059' is missing, which affects the creation of  column(s) 'MARRIED'
        [?]          '-ageInDaysNeonate' is missing, which affects the creation of  column(s) 'DIED_D1', 'DIED_D23', 'DIED_D36', 'DIED_W1', and 'NEONATE'
           ACUTE  FEMALE  MARRIED
        0      y       .        .
        1      y       .        .
        2      y       .        .
        3      y       .        .
        4      y       .        .

        `transform` will also accept mapping configurations with missing values,
        with new columns that are specified but missing source columns.
        These new columns will be created so that the final result has the correct
        expected columns for the algorithm, but filled with NA values to indicate
        the lack of information. If `preserve_na` is set to `False`, then the NA
        values will also be `False`.

        This situation is common between certain questionnaire sources and algorithms.
        For example, in the mapping between the PHRMC Short questionnaire
        to InterVA5 mapping, there are 107 InterVA5
        variables that are listed in the mapping configuration to be created,
        but have no corresponding question in PHRMC short.

        For example, variables i004a and i004b have no specifications in the
        mapping below. They are still listed under "New Column Name" so
        CrossVA knows that they should be created in the final result, but
        because they have no logic defined, they will be left as their default
        value of NA.

        >>> phrmc_to_interva5 = pd.read_csv('resources/mapping_configuration_files/PHRMCShort_to_InterVA5.csv')
        >>> phrmc_to_interva5.iloc[:5,[0,2,4,-1]]
          New Column Name Source Column ID Relationship Meta: Notes
        0           i004a              NaN          NaN   Not asked
        1           i004b              NaN          NaN   Not asked
        2           i019a          gen_5_2           eq         NaN
        3           i019b          gen_5_2           eq         NaN
        4           i022a         gen_5_4h           ge         NaN

        The `transform` function will warn the user of this behavior.

        >>> transform(phrmc_to_interva5, "resources/sample_data/PHRMC_mock_data_1.csv").iloc[:5,:5]
        Validating Mapping Configuration . . .
        <BLANKLINE>
         WARNINGS
        [?] 	 124 new column(s) listed but not defined in Mapping Configuration detected. These ('i004a', 'i004b', 'i059o', 'i082o', 'i087o', 'i091o', 'i092o', 'i093o', 'i094o', 'i095o', etc) will be treated as NA.
        Validating Mapping-Data Relationship . . .
        <BLANKLINE>
         WARNINGS
        [?] 	 9 (5.7%) expected source column IDs listed in mapping file ('child_6_2', 'child_4_4', 'child_4_20', 'child_4_7a', 'child_4_40', 'child_4_28', 'child_4_30', 'child_1_5a', and 'child_5_1') were not found in the input data columns. Their values will be NA.
        [?] 	 'child_1_5a' is missing, which affects the creation of  column(s) 'i358a'
        [?] 	 'child_4_20' is missing, which affects the creation of  column(s) 'i171o'
        [?] 	 'child_4_28' is missing, which affects the creation of  column(s) 'i208o'
        [?] 	 'child_4_30' is missing, which affects the creation of  column(s) 'i233o'
        [?] 	 'child_4_4' is missing, which affects the creation of  column(s) 'i150a'
        [?] 	 'child_4_40' is missing, which affects the creation of  column(s) 'i200o'
        [?] 	 'child_4_7a' is missing, which affects the creation of  column(s) 'i183o'
        [?] 	 'child_5_1' is missing, which affects the creation of  column(s) 'i418o'
        [?] 	 'child_6_2' is missing, which affects the creation of  column(s) 'i130o'
           ID  i004a  i004b  i019a  i019b
        0   1      .      .      y      n
        1   2      .      .      n      n
        2   3      .      .      n      n
        3   4      .      .      y      n
        4   5      .      .      n      n

        However, the mapping-data relationship must be valid. For example, if
        the source column IDs are not unique for the input data - that is,
        if multiple columns in the input data contain the same source ID - then
        validation will fail.

        For example, `bad_data` contains columns named `A-Id10004` and `B-Id10004`,
        but the 2016 WHO mapping is looking for just `-Id10004` as a source ID.
        CrossVA cannot tell which column should be used, so validation fails.

        >>> bad_data = pd.read_csv("resources/sample_data/2016WHO_bad_data_1.csv")
        >>> transform(("2016WHOv151", "InSilicoVA"), bad_data)
        Validating Mapping-Data Relationship . . .
        <BLANKLINE>
         ERRORS
        [!]      1 source column IDs ('-Id10004') were found multiple times in the input data. Each source column ID should only occur once as part of an input data column name. It should be a unique identifier at the end of an input data column name. Source column IDs are case sensitive. Please revise your mapping configuration or your input data so that this condition is satisfied.

    """
    # read in mapping data
    if isinstance(mapping, tuple):  # if mapping is in (input, output) format
        # the requested algorithm overrides the preserve_na argument
        mapping_data, preserve_na = _read_builtin_mapping(mapping)
    else:
        mapping_data = flexible_read(mapping)

    if mapping_data.empty:  # this shouldn't happen; if it does, raise
        raise ValueError(("No valid mapping data provided to transform. Should be"
                          " either a tuple in form (input, output), a path to csv"
                          " or a Pandas DataFrame."))

    # init configuration obj from given mapping data
    config = Configuration(config_data=mapping_data,
                           verbose=verbose,
                           process_strings=False)

    # an invalid configuration cannot be used to transform anything
    if not config.validate(verbose=verbose):
        raise ValueError(("Configuration from mapping file must be valid "
                          "before transform."))

    # TODO adds args to init based on data type?
    input_data = flexible_read(raw_data)
    cross_va = CrossVA(input_data, config)
    if not cross_va.validate(verbose=verbose):
        # Deliberately returns None instead of raising: validate() is handed
        # `verbose` and reports the problems itself (see the `bad_data`
        # example in the docstring).
        return None
    final_data = cross_va.process()

    # if result values have been changed, then map as directed, otherwise
    # leave alone - the default values are what we actually have, so we don't
    # need to do any mapping if they have not specified an alternative.
    defaults = {"Present": 1, "Absent": 0, "NA": np.nan}
    if result_values != defaults:
        actual_mapping = {value: result_values[key]
                          for key, value in defaults.items()}
        final_data = final_data.replace(actual_mapping)

    if raw_data_id is not None:
        try:
            record_ids = input_data[raw_data_id]
        except KeyError as err:
            raise ValueError((f"Could not find column named {raw_data_id} "
                              "in raw_data.")) from err
        final_data.insert(loc=0, column="ID", value=record_ids)
    else:
        # no ID column given: number the records from 1 using the row index
        final_data.reset_index(inplace=True)
        final_data.rename(columns={"index": "ID"}, inplace=True)
        final_data["ID"] = final_data["ID"] + 1

    if preserve_na:
        return final_data
    # NOTE(review): this runs after the result_values replacement above, so
    # it only fills values that are still NaN at this point.
    return final_data.fillna(0)


if __name__ == "__main__":
    # When executed as a script, run the docstring examples in this module
    # as doctests with the NORMALIZE_WHITESPACE option set.
    from doctest import NORMALIZE_WHITESPACE, testmod

    testmod(optionflags=NORMALIZE_WHITESPACE)