# Source code for validation

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Module containing Validation class, and Vcheck class and its subclasses
"""
from abc import ABCMeta, abstractmethod

import pandas as pd
import numpy as np
import re

from pycrossva.utils import report_list


class VCheck(metaclass=ABCMeta):
    """Abstract class for a single validation check.

    A concrete subclass must supply ``tier``, ``bullet``, ``level`` and
    ``title``. They are declared here as abstract properties; ``expand``
    accepts them implemented either as properties or as plain zero-argument
    methods.
    """

    def __init__(self, message):
        """Inits VCheck class

        Args:
            message (str): the message associated with the validation check,
                which should describe why the check has passed or failed.

        Examples:
            The exact wording of the error differs between Python versions,
            so only the exception type is checked.

            >>> VCheck("Test Message")  # doctest: +IGNORE_EXCEPTION_DETAIL
            Traceback (most recent call last):
            TypeError: Can't instantiate abstract class VCheck
        """
        self.message = message

    @staticmethod
    def _resolve(member):
        """Returns ``member()`` if member is callable, else member itself."""
        return member() if callable(member) else member

    def expand(self):
        """Expands VCheck information as a Pandas Series

        Args:
            None

        Returns:
            Pandas Series: representing VCheck attributes as a Pandas Series,
            indexed by "Tier", "Bullet", "Level", "Title" and "Message".

        Examples:
            >>> Err("Error Message").expand()
            Tier               Error
            Bullet               [!]
            Level                  1
            Title             ERRORS
            Message    Error Message
            dtype: object
        """
        labels = ["Tier", "Bullet", "Level", "Title", "Message"]
        # Each member may be a bound method or an already-evaluated property
        # value; _resolve normalises both to the plain value.
        values = [self._resolve(self.tier),
                  self._resolve(self.bullet),
                  self._resolve(self.level),
                  self._resolve(self.title),
                  self.message]
        return pd.Series(values, labels)

    @property
    @abstractmethod
    def tier(self):
        """abstract property, must be overridden. Should be str, representing
        name of VCheck tier"""
        return

    @property
    @abstractmethod
    def bullet(self):
        """abstract property, must be overridden. Should be a str,
        representing a bullet point"""
        return

    @property
    @abstractmethod
    def level(self):
        """abstract property, must be overridden. Should be int, representing
        VCheck tier"""
        return

    @property
    @abstractmethod
    def title(self):
        """abstract property, must be overridden. Should be str, representing
        title of VCheck type"""
        return
class Err(VCheck):
    """VCheck subclass representing a serious problem in data validation that
    prevents validation.

    Examples:
        >>> Err("This is a data validation error").expand()
        Tier                                 Error
        Bullet                                 [!]
        Level                                    1
        Title                               ERRORS
        Message    This is a data validation error
        dtype: object
    """

    def level(self) -> int:
        """Returns the numeric level of this check, 1."""
        return 1

    def tier(self) -> str:
        """Returns the tier name, "Error"."""
        return "Error"

    def title(self) -> str:
        """Returns the title string, "ERRORS"."""
        return "ERRORS"

    def bullet(self) -> str:
        """Returns the bullet marker, "[!]"."""
        return "[!]"
class Warn(VCheck):
    """VCheck subclass representing a problem in data validation that can be
    fixed in place, but would otherwise prevent validation.

    Examples:
        >>> Warn("This is a data validation warning").expand()
        Tier                                 Warning
        Bullet                                   [?]
        Level                                      2
        Title                               WARNINGS
        Message    This is a data validation warning
        dtype: object
    """

    def level(self) -> int:
        """Returns the numeric level of this check, 2."""
        return 2

    def tier(self) -> str:
        """Returns the tier name, "Warning"."""
        return "Warning"

    def title(self) -> str:
        """Returns the title string, "WARNINGS"."""
        return "WARNINGS"

    def bullet(self) -> str:
        """Returns the bullet marker, "[?]"."""
        return "[?]"
class Suggest(VCheck):
    """VCheck subclass representing a minor problem with data that does not
    prevent data validation.

    Examples:
        >>> Suggest("This is a data validation suggestion").expand()
        Tier                                 Suggestion
        Bullet                                      [i]
        Level                                         3
        Title                               SUGGESTIONS
        Message    This is a data validation suggestion
        dtype: object
    """

    def level(self) -> int:
        """Returns the numeric level of this check, 3."""
        return 3

    def tier(self) -> str:
        """Returns the tier name, "Suggestion"."""
        return "Suggestion"

    def title(self) -> str:
        """Returns the title string, "SUGGESTIONS"."""
        return "SUGGESTIONS"

    def bullet(self) -> str:
        """Returns the bullet marker, "[i]"."""
        return "[i]"
class Passing(VCheck):
    """VCheck subclass representing a passed check in data validation, where
    there is no problem.

    Examples:
        >>> Passing("This is a passing data validation check").expand()
        Tier                                       Passing
        Bullet                                         [X]
        Level                                            4
        Title                                CHECKS PASSED
        Message    This is a passing data validation check
        dtype: object
    """

    def level(self) -> int:
        """Returns the numeric level of this check, 4."""
        return 4

    def tier(self) -> str:
        """Returns the tier name, "Passing"."""
        return "Passing"

    def title(self) -> str:
        """Returns the title string, "CHECKS PASSED"."""
        return "CHECKS PASSED"

    def bullet(self) -> str:
        """Returns the bullet marker, "[X]"."""
        return "[X]"
class Validation:
    """Validation object represents an organized dataframe of validation
    checks

    Attributes:
        vchecks (Pandas DataFrame): a dataframe containing the expanded form
            of the VCheck instances that have been added.
        name (str): label for what is being validated; it appears in the
            heading printed by `report` and in `flag_elements` messages.
    """

    def __init__(self, name=""):
        """Inits Validation class

        Args:
            name (str): label for what is being validated. Defaults to "".
        """
        self.vchecks = pd.DataFrame()
        self.name = name

    def _append_check(self, vcheck):
        """Internal helper that appends one VCheck to self.vchecks as a row.

        Args:
            vcheck (VCheck): the check whose `.expand()` Series becomes the
                new row.

        Returns:
            None
        """
        # expand() yields a Series; transposing its one-column frame turns it
        # into a single row with Tier/Bullet/Level/Title/Message columns.
        row = pd.DataFrame(vcheck.expand()).T
        self.vchecks = pd.concat([self.vchecks, row], ignore_index=True)

    def _add_condition(self, flagged_series, pass_check, fail_check):
        """Internal method that adds a fail_check to the self.vchecks
        attribute if any in flagged_series are True, else adds pass_check to
        the self.vchecks attribute. Both fail_check and pass_check are
        appended to self.vchecks through their .expand() method, which
        returns their information as a Pandas Series.

        Args:
            flagged_series (Pandas Series): a boolean Pandas Series, where
                True represents a failed condition that has been flagged.
            pass_check (VCheck): the VCheck to report if the check passes
            fail_check (VCheck): the VCheck to report if the check fails

        Returns:
            None

        Examples:
            >>> v = Validation()
            >>> v._add_condition(pd.Series([False, False, False]),
            ...     Passing("Passed test"), Err("Failed test"))
            >>> v._add_condition(pd.Series([False, False, True]),
            ...     Passing("Passed test"), Err("Failed test"))
            >>> v.vchecks
                  Tier Bullet Level          Title      Message
            0  Passing    [X]     4  CHECKS PASSED  Passed test
            1    Error    [!]     1         ERRORS  Failed test
        """
        outcome = fail_check if flagged_series.sum() > 0 else pass_check
        self._append_check(outcome)

    def no_duplicates(self, my_series):
        """Adds a validation check as `Err` if any items in my_series are
        duplicates. Intended to alert users of issues where there are
        duplicate columns before an exception is raised.

        Args:
            my_series (Pandas Series): series where there should not be dupes

        Returns:
            None
        """
        comparison = my_series.duplicated()
        passing_msg = ("Source column IDs do not match more than one column in"
                       " input data.")
        fail_msg = (f"{comparison.sum()} source column IDs "
                    f"{report_list(my_series[comparison])}"
                    " were found multiple times in the input data. Each source"
                    " column ID should only occur once as part of an input data"
                    " column name. It should be a unique identifier at"
                    " the end of an input data column name. Source column IDs"
                    " are case sensitive. Please revise your mapping configuration"
                    " or your input data so that this condition is satisfied.")
        self._add_condition(comparison, Passing(passing_msg), Err(fail_msg))

    def affected_by_absence(self, missing_grped):
        """Adds a validation check as `Warn` describing the items in
        missing_grped, which detail the impact that missing columns have on
        newly created mappings. One `Warn` row is added per entry.

        Args:
            missing_grped (Pandas Series): series where the index is the name
                of the missing source column, and the values are a list of
                affected values.

        Returns:
            None
        """
        # Series.items() replaces Series.iteritems(), which no longer exists
        # in pandas 2.x.
        for source, affected_list in missing_grped.items():
            msg = (f"'{source}' is missing, which affects the creation of "
                   f" column(s) {report_list(affected_list, paren=False)}")
            self._append_check(Warn(msg))

    def must_contain(self, given, required, passing_msg="", fail=Err):
        """Adds a validation check where `given` must contain every item in
        `required` at least once to pass, and `fail_check` is `fail`, (fails
        validation).

        Args:
            given (Pandas Series): the items representing input given
            required (Pandas Series): the items required to be in `given`
            passing_msg (str): Message to return if all items in `required`
                are listed in `given`. Defaults to "".
            fail (VCheck): the outcome if the check fails. Default is Err.

        Returns:
            None

        Examples:
            >>> v = Validation()
            >>> v.must_contain(pd.Series(["a","b","c"], name="example input"),
            ...     pd.Series(["a","b"], name="example requirement(s)"),
            ...     "all included")
            >>> v.must_contain(pd.Series(["a","b","c"], name="example input"),
            ...     pd.Series(["a","b","d"], name="example requirement(s)"))
            >>> v.report(verbose=4)
            Validating  . . .
            <BLANKLINE>
             CHECKS PASSED
            [X]      all included
            <BLANKLINE>
             ERRORS
            [!]      1 (33.3%) example requirement(s) ('d') were not found in
                example input. Their values will be NA.
        """
        # Comparison is true (fails) when an item in required isn't in given
        comparison = ~required.isin(given)
        n_missing = comparison.sum()
        # An empty `required` can never fail; avoid computing 0 / 0 for a
        # message that will not be shown.
        share = n_missing / comparison.size if comparison.size else 0.0
        percentage = '{0:.1%}'.format(share)
        fail_msg = " ".join([str(n_missing),
                             '(' + percentage + ')',
                             str(required.name),
                             report_list(required[comparison]),
                             "were not found in",
                             str(given.name) + ".",
                             "Their values will be NA."])
        self._add_condition(comparison, Passing(passing_msg), fail(fail_msg))

    def no_extraneous(self, given, relevant, value_type):
        """Adds a validation check where all values in `given` should also be
        in `relevant` to pass. `fail_check` is `Err`.

        Args:
            given (Pandas Series): the items representing input given
            relevant (Pandas Series): all items in `given` that will be used
            value_type (str): string describing the kind of noun that is
                listed in `given`

        Returns:
            None

        Examples:
            >>> v = Validation()
            >>> v.no_extraneous(pd.Series(["a","b"], name="example input"),
            ...     pd.Series(["a","b","c"], name="relevant value(s)"),
            ...     "example")
            >>> v.no_extraneous(pd.Series(["a","b","c"], name="example input"),
            ...     pd.Series(["a","d"], name="relevant value(s)"),
            ...     "example")
            >>> v.report(verbose=4)
            Validating  . . .
            <BLANKLINE>
             CHECKS PASSED
            [X]      No extraneous example found in example input.
            <BLANKLINE>
             ERRORS
            [!]      2 extraneous example(s) found in example input
                ('b', and 'c') Extraneous example(s) will be ommitted.
        """
        # comparison is true (fails) when an item in `given` isn't in
        # `relevant`
        comparison = ~given.isin(relevant)
        fail_msg = " ".join([str(comparison.sum()),
                             "extraneous",
                             value_type + "(s)",
                             "found in",
                             str(given.name),
                             report_list(given[comparison], limit=5),
                             "Extraneous",
                             value_type + "(s)",
                             "will be ommitted."])
        passing_msg = " ".join(["No extraneous", value_type, "found in",
                                str(given.name) + "."])
        self._add_condition(comparison, Passing(passing_msg), Err(fail_msg))

    def all_valid(self, given, valid, definition):
        """Adds a validation check where all values in `given` must be in
        `valid` to pass. `fail_check` is `Err` (fails validation).

        Args:
            given (Pandas Series): the items representing input given
            valid (Pandas Series): list of all possible valid items accepted
                in `given`
            definition (str): string describing what makes an item in
                `given` be in `valid`

        Returns:
            None

        Examples:
            >>> v = Validation()
            >>> v.all_valid(pd.Series(["a","b"], name="example input"),
            ...     pd.Series(["a","b","c"], name="valid value(s)"),
            ...     "pre-defined")
            >>> v.all_valid(pd.Series(["a","b","c"], name="example input"),
            ...     pd.Series(["a","d"], name="valid value(s)"),
            ...     "'a' or 'd'")
            >>> v.report(verbose=4)
            Validating  . . .
            <BLANKLINE>
             CHECKS PASSED
            [X]      All values in example input are valid.
            <BLANKLINE>
             ERRORS
            [!]      2 values in example input were invalid ('b', and 'c').
                These must be 'a' or 'd' to be valid.
        """
        # comparison is true (fails) when an item in `given` isn't in `valid`
        comparison = ~given.isin(valid)
        passing_msg = " ".join(["All values in", str(given.name),
                                "are valid."])
        fail_msg = " ".join([str(comparison.sum()),
                             "values in",
                             str(given.name),
                             "were invalid",
                             report_list(given[comparison]) + ".",
                             "These must be",
                             definition,
                             "to be valid."])
        self._add_condition(comparison, Passing(passing_msg), Err(fail_msg))

    def flag_elements(self, flag_where, flag_elements, criteria):
        """Adds a validation check seeing if any values in flag_where are
        true, and then reports on the corresponding items in flag_elements.
        `fail_check` is `Warn`.

        Args:
            flag_where (Pandas Series): a boolean Pandas Series where True
                represents a failed check
            flag_elements (Pandas Series): a Pandas Series listing the
                elements that are affected by True values in `flag_where`
            criteria (String): a brief description of what elements are being
                flagged and reported on

        Returns:
            None

        Examples:
            >>> v = Validation("element test")
            >>> v.flag_elements(pd.Series([False, False]),
            ...     pd.Series(["A", "B"]), "red flag(s)")
            >>> v.flag_elements(pd.Series([False, True]),
            ...     pd.Series(["A", "B"]), "blue flag(s)")
            >>> v.report(verbose=4)
            Validating element test . . .
            <BLANKLINE>
             CHECKS PASSED
            [X]      No red flag(s) in element test detected.
            <BLANKLINE>
             WARNINGS
            [?]      1 blue flag(s) in element test detected. These ('B')
                will be treated as NA.
        """
        passing_msg = f"No {criteria} in {self.name} detected."
        fail_msg = " ".join([str(flag_where.sum()),
                             criteria,
                             "in",
                             self.name,
                             "detected.",
                             "These",
                             report_list(flag_elements[flag_where]),
                             "will be treated as NA."])
        self._add_condition(flag_where, Passing(passing_msg), Warn(fail_msg))

    def flag_rows(self, flag_where, flag_criteria, flag_action="",
                  flag_tier=Warn):
        """Adds a validation check seeing if any values in flag_where are
        true, where fail_check is of type flag_tier. Note that rows are
        reported counting from 0.

        Args:
            flag_where (Pandas Series): a boolean Pandas Series where True
                represents a failed check.
            flag_criteria (str): a noun clause describing the criteria for an
                item to be flagged in `flag_where`
            flag_action (str): string describing the action to be taken if an
                item is flagged. Defaults to "".
            flag_tier (VCheck): should be either Suggest, Warn, or Err, is
                the seriousness of the failed check.

        Returns:
            None

        Examples:
            >>> v = Validation()
            >>> v.flag_rows(pd.Series([False, False]),
            ...     flag_criteria="true values")
            >>> v.flag_rows(pd.Series([False, True]),
            ...     flag_criteria="true values")
            >>> v.report(verbose=4)
            Validating  . . .
            <BLANKLINE>
             CHECKS PASSED
            [X]      No true values detected.
            <BLANKLINE>
             WARNINGS
            [?]      1 true values detected in row(s) #1.
        """
        passing_msg = " ".join(["No", flag_criteria, "detected."])
        fail_msg = " ".join([str(flag_where.sum()),
                             flag_criteria,
                             "detected in row(s)",
                             report_row(flag_where) + ".",
                             flag_action])
        self._add_condition(flag_where, Passing(passing_msg),
                            flag_tier(fail_msg))

    def _check_df(self, df, condition, flag_criteria, flag_action="",
                  flag_tier=Warn):
        """Adds a validation check flagging the rows in every column of `df`
        where applying the function condition changes the value of the
        element. Passes flag_criteria, flag_action and flag_tier on to
        flag_rows().

        Args:
            df (Pandas DataFrame): a Pandas DataFrame where each column
                should be checked
            condition (function): a function that corrects possible errors in
                each column, but does not change elements which are already
                fine.
            flag_criteria (str): a noun clause describing the criteria for an
                item to be changed by `condition`
            flag_action (str): string describing the action to be taken if an
                item is flagged. Defaults to "".
            flag_tier (VCheck): should be either Suggest, Warn, or Err, is
                the seriousness of the failed check.

        Returns:
            Pandas DataFrame: a copy of `df` where condition() has been
            applied to every column. The input `df` is not modified.

        Examples:
            >>> v = Validation()
            >>> test_df = pd.DataFrame({"A":["a","B","c"],
            ...                         "B":["D","e","F"]})
            >>> v._check_df(test_df, str.lower, flag_criteria="lowercase char")
               A  B
            0  a  d
            1  b  e
            2  c  f
            >>> v._check_df(test_df, str.upper, flag_criteria="uppercase char")
               A  B
            0  A  D
            1  B  E
            2  C  F
            >>> v.report(verbose=4)
            Validating  . . .
            <BLANKLINE>
             WARNINGS
            [?]      1 lowercase char column A detected in row(s) #1.
            [?]      2 lowercase char column B detected in row(s) #0, and #2.
            [?]      2 uppercase char column A detected in row(s) #0, and #2.
            [?]      1 uppercase char column B detected in row(s) #1.
        """
        df = df.copy()
        # DataFrame.items() replaces DataFrame.iteritems(), which no longer
        # exists in pandas 2.x.
        for name, aseries in df.items():
            applied = aseries.apply(condition)
            # A row is flagged when condition() changed its value.
            self.flag_rows(applied.ne(aseries),
                           flag_criteria + " column " + str(name),
                           flag_action, flag_tier)
            df.loc[:, name] = applied
        return df

    def check_na(self, df):
        """Adds a validation check flagging the rows in every column of `df`
        that are `None`

        Args:
            df (Pandas DataFrame): a Pandas DataFrame with columns that
                should have no NA values

        Returns:
            None

        Examples:
            >>> v = Validation()
            >>> test_df = pd.DataFrame({"A":["a","B","c"],
            ...                         "B":["D","e",None]})
            >>> v.check_na(test_df)
            >>> v.report(verbose=4)
            Validating  . . .
            <BLANKLINE>
             CHECKS PASSED
            [X]      No NA's in column A detected.
            <BLANKLINE>
             WARNINGS
            [?]      1 NA's in column B detected in row(s) #2.
        """
        # The corrected frame returned by _check_df is intentionally
        # discarded; this method only records the checks.
        self._check_df(df,
                       condition=lambda x: "" if x is None else x,
                       flag_criteria="NA's in",
                       flag_tier=Warn)

    def fix_whitespace(self, df):
        """Adds a validation check flagging the rows in every column of `df`
        that contain whitespace

        Args:
            df (Pandas DataFrame): a Pandas DataFrame with columns that
                should have no whitespace

        Returns:
            Pandas DataFrame: `df` where leading/trailing whitespace is
            stripped and remaining whitespace is replaced with an underscore

        Examples:
            >>> v = Validation()
            >>> test_df = pd.DataFrame({"A":["a"," B ","Test Data"],
            ...                         "B":["D"," e","F "]})
            >>> v.fix_whitespace(test_df)
                       A  B
            0          a  D
            1          B  e
            2  Test_Data  F
            >>> v.report(verbose=4)
            Validating  . . .
            <BLANKLINE>
             CHECKS PASSED
            [X]      No whitespace in column B detected.
            <BLANKLINE>
             WARNINGS
            [?]      1 leading/trailing spaces column A detected in row(s) #1.
                Leading/trailing spaces will be removed.
            [?]      2 leading/trailing spaces column B detected in row(s) #1,
                and #2. Leading/trailing spaces will be removed.
            [?]      1 whitespace in column A detected in row(s) #2.
                Whitespace will be converted to '_'
        """
        stripped_df = self._check_df(df.fillna("").astype(str), str.strip,
                                     flag_criteria="leading/trailing "
                                     "spaces",
                                     flag_action="Leading/trailing spaces "
                                     "will be removed.")
        # pass stripped_df to check_df with a regex expression to replace
        # remaining whitespace with underscores, except for the " to "
        # construction
        return self._check_df(stripped_df.fillna("").astype(str),
                              lambda x: re.sub(r"(?<!to)\s(?!to)", "_", x),
                              flag_criteria="whitespace in",
                              flag_action="Whitespace will be converted to '_'")

    def fix_alnum(self, df):
        """Adds a validation check flagging the rows in every column of `df`
        that contain non-alphanumeric characters. Regex removes characters
        that are not alpha-numeric, but leaves periods that are part of a
        number.

        Args:
            df (Pandas DataFrame): a Pandas DataFrame with columns that
                should have only alphanumeric characters

        Returns:
            Pandas DataFrame: `df` where non-alphanumeric characters are
            removed

        Examples:
            >>> v = Validation()
            >>> test_df = pd.DataFrame({"A":["a","3.0","c"],
            ...                         "B":["??.test","test<>!",";test_data"]})
            >>> v.fix_alnum(test_df)
                 A          B
            0    a       test
            1  3.0       test
            2    c  test_data
            >>> v.report(verbose=4)
            Validating  . . .
            <BLANKLINE>
             CHECKS PASSED
            [X]      No non-alphanumeric value(s) in column A detected.
            <BLANKLINE>
             WARNINGS
            [?]      3 non-alphanumeric value(s) in column B detected in
                row(s) #0, #1, and #2. This text should be alphanumeric.
                Non-alphanumeric characters will be removed.
        """
        # NOTE(review): inside the character class, " -\." is a range from
        # space to period, so characters such as # $ % & ( ) * + , are kept.
        # Pattern left unchanged to preserve behavior - confirm intent.
        return self._check_df(df,
                              lambda x: re.sub(
                                  r"[^a-zA-Z0-9_ -\.]|\.(?!\d)|\!", r"",
                                  str(x)),
                              flag_criteria="non-alphanumeric value(s) in",
                              flag_action="This text should be alphanumeric. "
                              "Non-alphanumeric characters will be removed.")

    def fix_lowcase(self, df):
        """Adds a validation check flagging the rows in every column of `df`
        that contain lowercase characters.

        Args:
            df (Pandas DataFrame): a Pandas DataFrame with columns that
                should have only uppercase characters

        Returns:
            Pandas DataFrame: `df` where all characters are uppercase

        Examples:
            >>> v = Validation()
            >>> test_df = pd.DataFrame({"A":["a","B","c"],
            ...                         "B":["D","e","F"]})
            >>> v.fix_lowcase(test_df)
               A  B
            0  A  D
            1  B  E
            2  C  F
            >>> v.report(verbose=4)
            Validating  . . .
            <BLANKLINE>
             WARNINGS
            [?]      2 lower case value(s) in column A detected in row(s) #0,
                and #2. Convention to have this text be uppercase. Lower case
                text will be made uppercase.
            [?]      1 lower case value(s) in column B detected in row(s) #1.
                Convention to have this text be uppercase. Lower case text
                will be made uppercase.
        """
        # The trailing space in flag_criteria is kept so emitted messages
        # stay byte-identical.
        return self._check_df(df.astype(str), lambda x: x.upper(),
                              flag_criteria="lower case value(s) in ",
                              flag_action="Convention to have this text be "
                              "uppercase. Lower case text will be made "
                              "uppercase.")

    def fix_upcase(self, df):
        """Adds a validation check flagging the rows in every column of `df`
        that contain uppercase characters

        Args:
            df (Pandas DataFrame): a Pandas DataFrame with columns that
                should have only lowercase characters

        Returns:
            Pandas DataFrame: `df` where all characters are lowercase

        Examples:
            >>> v = Validation()
            >>> test_df = pd.DataFrame({"A":["a","B","c"],
            ...                         "B":["D","e","F"]})
            >>> v.fix_upcase(test_df)
               A  B
            0  a  d
            1  b  e
            2  c  f
            >>> v.report(verbose=4)
            Validating  . . .
            <BLANKLINE>
             WARNINGS
            [?]      1 upper case value(s) in column A detected in row(s) #1.
                Convention is to have this text be lowercase. Upper case text
                will be made lowercase.
            [?]      2 upper case value(s) in column B detected in row(s) #0,
                and #2. Convention is to have this text be lowercase. Upper
                case text will be made lowercase.
        """
        return self._check_df(df.astype(str), lambda x: x.lower(),
                              flag_criteria="upper case value(s) in",
                              flag_action="Convention is to have this text be "
                              "lowercase. Upper case text will be made"
                              " lowercase.")

    def is_valid(self):
        """Checks to see if instance is valid.

        Args:
            None

        Returns:
            bool: True if is valid (has no errors in vchecks) and False if
            instance has errors or where vchecks is empty.

        Examples:
            >>> Validation().is_valid()
            False
            >>> v = Validation()
            >>> v.must_contain(pd.Series(["A", "B"]), pd.Series(["B"]))
            >>> v.is_valid()
            True
            >>> v.must_contain(pd.Series(["A", "B"]), pd.Series(["C"]))
            >>> v.is_valid()
            False
        """
        if self.vchecks.empty:
            return False
        return (self.vchecks["Tier"] == "Error").sum() == 0

    def report(self, verbose=2):
        """Prints the checks in the vchecks attribute

        Args:
            verbose (int): Parameter controlling how much to print by
                filtering for the level in each vchecks row to be less than
                or equal to verbose. Defaults to 2 (print only `Warn` and
                `Err` checks)

        Returns:
            None

        Examples:
            >>> v = Validation("Testing Tests")
            >>> v._add_condition(pd.Series([False, False, False]),
            ...     Passing("Passed test"), Err("Failed test"))
            >>> v._add_condition(pd.Series([False, False, False]),
            ...     Passing("Passed test 2"), Err("Failed test"))
            >>> v._add_condition(pd.Series([False, False, True]),
            ...     Passing("Passed test"), Err("Error test"))
            >>> v._add_condition(pd.Series([False, False, True]),
            ...     Passing("Passed test"), Warn("Warn test"))
            >>> v._add_condition(pd.Series([False, False, True]),
            ...     Passing(""), Suggest("Suggest test"))
            >>> v.report(verbose=1)
            Validating Testing Tests . . .
            <BLANKLINE>
             ERRORS
            [!]      Error test
            >>> v.report(verbose=4)
            Validating Testing Tests . . .
            <BLANKLINE>
             CHECKS PASSED
            [X]      Passed test
            [X]      Passed test 2
            <BLANKLINE>
             ERRORS
            [!]      Error test
            <BLANKLINE>
             SUGGESTIONS
            [i]      Suggest test
            <BLANKLINE>
             WARNINGS
            [?]      Warn test
        """
        if self.vchecks.empty:
            print("No validation checks made.")
            return
        within_verbose = self.vchecks[self.vchecks["Level"] <= verbose]
        if within_verbose.empty:
            # Checks exist, but none at the requested verbosity: print
            # nothing.
            return
        print(f"Validating {self.name} . . .")
        # groupby sorts its keys, so sections are printed in alphabetical
        # order of title.
        for title, section in within_verbose.groupby("Title"):
            print("\n", title)
            for _, single_report in section.iterrows():
                print(single_report["Bullet"], "\t", single_report["Message"])
def report_row(flag_where):
    """A helper method to return an english explanation of what rows have
    been flagged with a failed validation check.

    Args:
        flag_where (Pandas Series): boolean Pandas Series representing failed
            validation checks. NA entries are treated as not flagged.

    Returns:
        str: a string reporting the index of the flagged rows. Integer labels
        are written as `#<label>`; other labels are reported as formatted by
        `report_list`.

    Examples:
        >>> report_row(pd.Series([True, True, False, True, False]))
        '#0, #1, and #3'
    """
    flagged = flag_where[flag_where.fillna(False)]
    unformatted_report = report_list(flagged.index.tolist(), paren=False)
    # Index.is_numeric() is deprecated in pandas 2.x; testing inferred_type
    # for integer/floating labels is the equivalent check.
    if flagged.index.inferred_type in ("integer", "floating"):
        # Turn quoted whole-number labels such as '3' into #3.
        return re.sub(r"'(\d+)'", r"#\1", unformatted_report)
    return unformatted_report
if __name__ == "__main__":
    # When executed as a script, run every doctest in this module; expected
    # output is compared with runs of whitespace treated as equal.
    from doctest import NORMALIZE_WHITESPACE, testmod

    testmod(optionflags=NORMALIZE_WHITESPACE)