## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width = 7,
  fig.height = 5
)

## ----setup--------------------------------------------------------------------
library(leakr)

## ----detector_overview--------------------------------------------------------
# View available detectors
available_detectors <- list_registered_detectors()
print(available_detectors)

## ----medical_example----------------------------------------------------------
# Create a medical dataset with subtle leakage
set.seed(456)
n <- 500
medical_data <- data.frame(
  patient_id = seq_len(n),
  age = sample(25:75, n, replace = TRUE),
  bmi = rnorm(n, 25, 4),
  blood_pressure = rnorm(n, 120, 15),
  diagnosis = factor(
    sample(c("healthy", "diseased"), n, replace = TRUE, prob = c(0.8, 0.2))
  )
)

# Add a leaky feature: treatment_received (only available post-diagnosis).
# Exactly one value is drawn per diseased patient and assigned by logical
# index, so the draw length always matches the rows being filled (no reliance
# on ifelse() recycling a shorter vector).
is_diseased <- medical_data$diagnosis == "diseased"
medical_data$treatment_received <- "no"
medical_data$treatment_received[is_diseased] <- sample(
  c("yes", "no"),
  sum(is_diseased),
  replace = TRUE,
  prob = c(0.9, 0.1)
)

# Audit the medical data
medical_report <- leakr_audit(
  data = medical_data,
  target = "diagnosis",
  id = "patient_id"
)
print(medical_report)

## ----financial_example--------------------------------------------------------
# Create financial data with temporal leakage
set.seed(789)
dates <- seq(as.Date("2020-01-01"), as.Date("2023-12-31"), by = "month")
financial_data <- data.frame(
  account_id = 1:200,
  transaction_date = sample(dates, 200, replace = TRUE),
  amount = rlnorm(200, 4, 1),
  account_balance = rnorm(200, 1000, 500),
  default_risk = factor(sample(c("low", "high"), 200, replace = TRUE))
)

# Sort by date
financial_data <- financial_data[order(financial_data$transaction_date), ]

# Add feature that uses future information (credit score after default
# assessment). Scores are drawn per risk group ("high" first, then "low") and
# written into the matching rows by logical index.
is_high_risk <- financial_data$default_risk == "high"
high_risk_scores <- rnorm(sum(is_high_risk), 450, 50)
low_risk_scores <- rnorm(sum(!is_high_risk), 750, 75)
financial_data$credit_score_updated <- numeric(nrow(financial_data))
financial_data$credit_score_updated[is_high_risk] <- high_risk_scores
financial_data$credit_score_updated[!is_high_risk] <- low_risk_scores

# Create temporal split
financial_data$split <- ifelse(
  financial_data$transaction_date < as.Date("2022-01-01"),
  "train",
  "test"
)

# Audit financial data
financial_report <- leakr_audit(
  data = financial_data,
  target = "default_risk",
  split = "split",
  id = "account_id"
)
print(financial_report)

## ----customer_duplicates------------------------------------------------------
# Create customer dataset with near-duplicates
set.seed(321)

# Original customers
customers <- data.frame(
  name = c(
    "John Smith", "Jane Doe", "Bob Johnson", "Alice Brown", "Charlie Davis"
  ),
  email = c(
    "john@email.com", "jane@email.com", "bob@email.com",
    "alice@email.com", "charlie@email.com"
  ),
  age = c(35, 28, 42, 31, 39),
  income = c(50000, 45000, 75000, 55000, 62000),
  purchase_category = factor(
    c("electronics", "books", "clothing", "electronics", "books")
  )
)

# Create near-duplicates with slight variations
near_dupes <- customers[1:3, ]
# Name variations
near_dupes$name <- c("J Smith", "Jane D", "Robert Johnson")
# Email variations
near_dupes$email <- c(
  "john.smith@email.com", "j.doe@email.com", "bob.johnson@email.com"
)
# Age variations
near_dupes$age <- near_dupes$age + c(1, 0, -1)

# Combine datasets
all_customers <- rbind(customers, near_dupes)
all_customers$customer_id <- seq_len(nrow(all_customers))

# Audit for duplicates
dup_report <- leakr_audit(
  data = all_customers,
  target = "purchase_category",
  id = "customer_id"
)
print(dup_report)

## ----configuration------------------------------------------------------------
# Example of custom configuration for sensitive detection
sensitive_config <- list(
  # Limit sample size for large datasets
  sample_size = 5000,
  # Lower threshold for correlation-based detection
  correlation_threshold = 0.7,
  # Threshold for considering records as duplicates
  duplicate_threshold = 0.9
)

# Apply custom configuration
iris_sensitive <- leakr_audit(
  data = iris,
  target = "Species",
  config = sensitive_config
)
print(iris_sensitive)

## ----large_dataset------------------------------------------------------------
# Create a large imbalanced dataset
set.seed(555)
large_n <- 10000
large_data <- data.frame(
  feature1 = rnorm(large_n),
  feature2 = sample(letters[1:10], large_n, replace = TRUE),
  feature3 = rnorm(large_n, 100, 20),
  # Imbalanced target
  target = factor(
    sample(c("rare", "common"), large_n, replace = TRUE, prob = c(0.05, 0.95))
  )
)

# Use stratified sampling to ensure representation
sample_indices <- stratified_sample(large_data$target, 1000)
sampled_data <- large_data[sample_indices, ]

# Verify sampling maintained class balance
table(large_data$target)
table(sampled_data$target)

# Audit sampled data
large_report <- leakr_audit(
  data = sampled_data,
  target = "target"
)
print(large_report)

## ----detailed_analysis--------------------------------------------------------
# Create complex dataset for comprehensive analysis
complex_data <- data.frame(
  id = 1:300,
  timestamp = seq(
    as.POSIXct("2023-01-01"),
    as.POSIXct("2023-12-31"),
    length.out = 300
  ),
  feature_a = rnorm(300),
  feature_b = sample(LETTERS[1:5], 300, replace = TRUE),
  feature_c = rnorm(300, 50, 10),
  outcome = factor(sample(c("success", "failure"), 300, replace = TRUE))
)

# Add intentional leakage for demonstration: a direct numeric copy of the
# target (1 for "success", 0 otherwise)
complex_data$leaky_feature <- ifelse(complex_data$outcome == "success", 1, 0)

# Generate comprehensive audit
detailed_report <- leakr_audit(
  data = complex_data,
  target = "outcome",
  id = "id"
)

# Generate detailed summary
detailed_summary <- leakr_summarise(
  detailed_report,
  top_n = 10,
  show_config = TRUE
)
print(detailed_summary)

## ----multi_stage--------------------------------------------------------------
# Multi-stage validation function.
#
# Runs three stages in order, printing a progress line before each:
#   1. validate_and_preprocess_data(data, target, split, id)
#   2. leakr_audit() on the preprocessed data
#   3. leakr_summarise() on the audit report (top 15, config shown)
# and then prints a warning line if any audit issue has severity "high".
#
# Arguments:
#   data   Data set to validate.
#   target Name of the target column.
#   id     Optional id column name (default NULL).
#   split  Optional split column name (default NULL).
#
# Returns a list with elements `data` (preprocessed data), `audit` (audit
# report) and `summary` (summary of the audit report).
comprehensive_validation <- function(data, target, id = NULL, split = NULL) {
  cat("Stage 1: Basic data validation\n")
  # Basic preprocessing and validation
  clean_data <- validate_and_preprocess_data(data, target, split, id)

  cat("Stage 2: Initial leakage screening\n")
  # Quick initial screening
  initial_report <- leakr_audit(
    clean_data,
    target = target,
    split = split,
    id = id
  )

  cat("Stage 3: Detailed analysis\n")
  # Generate detailed summary
  summary_report <- leakr_summarise(
    initial_report,
    top_n = 15,
    show_config = TRUE
  )

  # Count critical issues. isTRUE() treats a missing (NULL), NA or non-scalar
  # severity as "not critical" instead of propagating NA or raising an error;
  # vapply() returns logical(0) when there are no issues, so the count is 0.
  is_critical <- vapply(
    initial_report$issues,
    function(issue) isTRUE(issue$severity == "high"),
    logical(1)
  )
  critical_count <- sum(is_critical)
  if (critical_count > 0) {
    cat("WARNING:", critical_count, "critical issues detected!\n")
  }

  list(
    data = clean_data,
    audit = initial_report,
    summary = summary_report
  )
}

# Example usage
# validation_result <- comprehensive_validation(your_data, "target_column")

## ----domain_specific----------------------------------------------------------
# Example: E-commerce specific validation.
#
# Runs the standard leakr_audit() and additionally prints a warning for every
# non-target column whose name contains (case-insensitively) one of the
# post-purchase keywords "return", "refund", "satisfaction" or "rating".
#
# Arguments:
#   data   Data set to validate.
#   target Name of the target column.
#
# Returns the report produced by leakr_audit().
ecommerce_validation <- function(data, target) {
  # Standard audit
  base_report <- leakr_audit(data, target = target)

  # Domain-specific checks: look for post-purchase features by column name.
  # The target itself is excluded; it is the outcome being predicted, not a
  # candidate leaky feature.
  post_purchase_patterns <- c("return", "refund", "satisfaction", "rating")
  feature_names <- names(data)[!names(data) %in% target]

  matches_by_pattern <- lapply(post_purchase_patterns, function(pattern) {
    grep(pattern, feature_names, value = TRUE, ignore.case = TRUE)
  })
  matches_by_pattern <- matches_by_pattern[lengths(matches_by_pattern) > 0]

  # One message per keyword that matched at least one column
  issues <- vapply(
    matches_by_pattern,
    function(matching_features) {
      paste(
        "Potential post-purchase feature:",
        paste(matching_features, collapse = ", ")
      )
    },
    character(1)
  )

  if (length(issues) > 0) {
    cat("Domain-specific warnings:\n")
    for (issue in issues) {
      cat("-", issue, "\n")
    }
  }

  base_report
}

# Example e-commerce data
ecommerce_data <- data.frame(
  customer_id = 1:100,
  purchase_amount = rlnorm(100, 4, 1),
  product_category = sample(
    c("electronics", "books", "clothing"),
    100,
    replace = TRUE
  ),
  # Post-purchase!
  customer_satisfaction = sample(1:5, 100, replace = TRUE),
  will_repurchase = factor(sample(c("yes", "no"), 100, replace = TRUE))
)

# Validate e-commerce data
ecommerce_report <- ecommerce_validation(ecommerce_data, "will_repurchase")