This section continues the comparison of the utility achieved by the CNN, at its default and optimized thresholds, against Inferno. The workflow runs in R; the CNN's predicted probabilities are reconstructed from its stored logits with a function that replicates the sigmoid.
Code
# ======================================================================================================================
# Setup: Root Directory, Paths, Parallelism, and Data
# ======================================================================================================================
library(inferno)

# Number of cores to use
parallel <- 10

# Root directory for all data
rootdir <- "../data/inferno"

# Directory with trained Inferno models
learntdir <- file.path(rootdir, "combinedML50")

# Load metadata
metadata <- read.csv(file.path(learntdir, "metadata.csv"))
print(paste("Loaded metadata with", nrow(metadata), "entries"))

# Load test data
testdata <- read.csv(file.path(rootdir, "calibration_test.csv"))[, metadata$name]
print(paste("Loaded test data with", nrow(testdata), "samples and", ncol(testdata), "features"))
[1] "Loaded metadata with 7 entries"
[1] "Loaded test data with 1459 samples and 7 features"
🎯 Single Prediction and Decision Maximizing Medical Utility
Code
# ----------------------------------------------------------------------------------------------------------------------
# 🎯 Single Prediction and Decision
# ----------------------------------------------------------------------------------------------------------------------
# Define predictands (outcome variables) and predictors (input features)
predictands <- c("LABEL_EFFUSION", "LABEL_ATELECTASIS")
predictors <- setdiff(metadata$name, predictands)

# Define possible outcomes
y <- setNames(expand.grid(0:1, 0:1), as.list(predictands))
outcomenames <- apply(y, 1, function(x) paste0("E", x[1], "_A", x[2]))

# Define utility (or cost-benefit) matrix
ematrix <- matrix(
  c(
    1.00, 0.55, 0.60, 0.40,
    0.90, 0.60, 0.65, 0.75,
    0.90, 0.65, 0.60, 0.75,
    0.80, 0.85, 0.85, 0.60
  ),
  nrow = 4,
  byrow = TRUE
)
colnames(ematrix) <- outcomenames
rownames(ematrix) <- outcomenames

# Patient index to evaluate
patient_idx <- 100

# Extract patient predictors
x_patient <- testdata[patient_idx, predictors, drop = FALSE]

# Extract true labels
true_labels <- testdata[patient_idx, predictands, drop = FALSE]
true_outcome_name <- paste0("E", true_labels[[1]], "_A", true_labels[[2]])

# Predict outcome probabilities
probs <- Pr(
  Y = y,
  X = x_patient,
  learnt = learntdir,
  parallel = parallel
)

# Calculate expected utilities
exputilities <- ematrix %*% probs$values

# Decision function: choose option maximizing expected utility
# (ties broken at random; rep(..., 2) keeps sample() from treating a
# lone index as the upper bound of a sampling range)
choosemax <- function(x) {
  sample(rep(which(x == max(x)), 2), 1)
}

# Make decision
decision <- choosemax(exputilities)

# ----------------------------------------------------------------------------------------------------------------------
# Detailed Report for Single Prediction
# ----------------------------------------------------------------------------------------------------------------------
cat("\n================ Single Prediction Report ================\n")
cat("Patient Index:", patient_idx, "\n\n")

cat("Input predictors for this patient:\n")
print(x_patient)

cat("\nPredicted probabilities for outcomes:\n")
predictions_table <- data.frame(
  Outcome = outcomenames,
  Probability = round(as.numeric(probs$values), 4)
)
print(predictions_table)

cat("\nExpected utilities for each decision:\n")
utility_table <- data.frame(
  Outcome = outcomenames,
  ExpectedUtility = round(as.numeric(exputilities), 4)
)
print(utility_table)

cat("\nTrue labels (ground truth):\n")
print(true_labels)
cat("True outcome name:", true_outcome_name, "\n")

cat("\n✅ Best guess (Maximizing Expected Utility):", outcomenames[decision], "\n")
cat("==========================================================\n")
Registered doParallelSNOW with 10 workers
Closing connections to cores.
================ Single Prediction Report ================
Patient Index: 100
Input predictors for this patient:
AGE GENDER VP LOGIT_EFFUSION LOGIT_ATELECTASIS
100 27 M PA 1.267194 -1.452902
Predicted probabilities for outcomes:
Outcome Probability
1 E0_A0 0.1766
2 E1_A0 0.6835
3 E0_A1 0.0362
4 E1_A1 0.1036
Expected utilities for each decision:
Outcome ExpectedUtility
1 E0_A0 0.6158
2 E1_A0 0.6703
3 E0_A1 0.7027
4 E1_A1 0.8153
True labels (ground truth):
LABEL_EFFUSION LABEL_ATELECTASIS
100 1 1
True outcome name: E1_A1
✅ Best guess (Maximizing Expected Utility): E1_A1
==========================================================
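Two details of the report are worth unpacking: each expected utility is the dot product of a row of ematrix with the probability vector, and choosemax breaks ties at random. A small self-contained check, using the rounded probabilities printed above (the tie values in the second part are made up for illustration):

Code

# Hand-check of the E1_A1 expected utility from the report:
# row E1_A1 of ematrix dotted with the predicted probabilities
sum(c(0.80, 0.85, 0.85, 0.60) * c(0.1766, 0.6835, 0.0362, 0.1036))
# ~ 0.8152, matching the printed value up to rounding

# Tie-breaking in choosemax: with a two-way tie between options 2 and 4,
# which(x == max(x)) returns c(2, 4); rep(..., 2) guarantees sample()
# receives a vector of length > 1 (sample(n, 1) on a lone integer n
# would instead draw from 1:n)
eu_toy <- c(0.61, 0.82, 0.70, 0.82)
choosemax(eu_toy)  # returns 2 or 4, each with probability 1/2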
Inferno Evaluation on Full Dataset
Predictions are made for all patients using the Inferno model.
Expected utilities are calculated using the clinical utility matrix.
Optimal decisions are chosen to maximize expected utility for each patient.
True labels are mapped and checked for consistency.
Model performance is evaluated based on:
Clinical utility matrix (reflecting medical priorities).
Bare diagonal matrix (pure classification accuracy).
Baseline accuracy (always guessing the most common outcome).
Results show how the model outperforms naive guessing and how clinical priorities affect evaluation.
Code
# ----------------------------------------------------------------------------------------------------------------------
# Full Dataset Evaluation Maximizing Medical Utility
# ----------------------------------------------------------------------------------------------------------------------
# Full predictors and true labels
X <- testdata[, predictors, drop = FALSE]
trueY <- testdata[, predictands, drop = FALSE]

# Predict outcome probabilities for full dataset
probs_full <- Pr(
  Y = y,
  X = X,
  learnt = learntdir,
  parallel = parallel,
  quantiles = c(0.055, 0.945),
  nsamples = NULL
)

# Calculate expected utilities for full dataset
exputilities_full <- ematrix %*% probs_full$values

# Make decisions for full dataset
decisions_full <- apply(exputilities_full, 2, choosemax)

# Map true labels to indices (E + 2*A + 1, following the row order of y)
truevalues <- apply(trueY, 1, function(x) (x[1] + 2 * x[2]) + 1)

# Map true labels to outcome names
trueoutcomenames <- apply(trueY, 1, function(x) paste0("E", x[1], "_A", x[2]))

# Calculate baseline and model performance
# (which.max returns the position of the most frequent level; the levels
# here are 1..4 in order, so the position coincides with the value)
most_common_value <- which.max(table(truevalues))
baseline_accuracy <- sum(truevalues == most_common_value) / length(truevalues)
avgyield <- mean(ematrix[cbind(decisions_full, truevalues)])

# Create bare identity matrix as utility matrix (perfect classification only)
ematrix_diag <- diag(4)

# Calculate expected utilities and decisions with bare diagonal matrix
exputilities_diag <- ematrix_diag %*% probs_full$values
decisions_diag <- apply(exputilities_diag, 2, choosemax)
avgyield_diag <- mean(ematrix_diag[cbind(decisions_diag, truevalues)])
Registered doParallelSNOW with 10 workers
Closing connections to cores.
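The index mapping E + 2·A + 1 used for truevalues follows the row order of expand.grid(0:1, 0:1), which also built y and outcomenames: index 1 is E0_A0, 2 is E1_A0, 3 is E0_A1, and 4 is E1_A1. A quick standalone check of that correspondence:

Code

# Rows of expand.grid(0:1, 0:1) come out in the order (0,0), (1,0), (0,1), (1,1)
grid <- expand.grid(E = 0:1, A = 0:1)
data.frame(
  index = grid$E + 2 * grid$A + 1,
  name  = paste0("E", grid$E, "_A", grid$A)
)
# index runs 1..4 and matches outcomenames: E0_A0, E1_A0, E0_A1, E1_A1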
Code
# ----------------------------------------------------------------------------------------------------------------------
# Printout of Evaluation Results
# ----------------------------------------------------------------------------------------------------------------------
cat("\nTrue outcome distribution (%):\n")
print(round(table(truevalues) / sum(table(truevalues)) * 100, 2))

cat("\nName consistency check:", all(trueoutcomenames == outcomenames[truevalues]), "\n")

cat("\nInferno expected utility (accuracy with clinical utility matrix):", round(avgyield, 4), "\n")
cat("\n🧪 Expected utility (accuracy with bare diagonal utility matrix):", round(avgyield_diag, 4), "\n")
cat("\n🎯 Baseline accuracy (predicting most common outcome):", round(baseline_accuracy * 100, 1), "%\n")
True outcome distribution (%):
truevalues
1 2 3 4
46.74 23.51 22.62 7.13
Name consistency check: TRUE
Inferno expected utility (accuracy with clinical utility matrix): 0.9182
🧪 Expected utility (accuracy with bare diagonal utility matrix): 0.645
🎯 Baseline accuracy (predicting most common outcome): 46.7 %
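The average yields above come from matrix indexing: ematrix[cbind(decisions, truevalues)] extracts, for each patient, the utility of the chosen decision given the true outcome, and its mean is the realized average utility. A toy illustration with three hypothetical patients:

Code

# Decisions (row indices) and true outcomes (column indices) per patient
decisions <- c(2, 4, 1)
truths    <- c(2, 4, 3)

# cbind() indexing picks one matrix element per (row, col) pair:
# ematrix[2, 2], ematrix[4, 4], ematrix[1, 3]
mean(ematrix[cbind(decisions, truths)])

# With the identity matrix the same expression reduces to plain accuracy
# (1 when decision == truth, 0 otherwise): here 2 of 3 correct
mean(diag(4)[cbind(decisions, truths)])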
🤖 Comparison with Neural Net Decisions at Thresholds 0.5 and 0.28
Threshold 0.5:
This corresponds to the usual sigmoid threshold where a probability ≥ 0.5 is interpreted as a positive prediction.
It is a standard classification rule but does not account for clinical consequences.
Threshold 0.28:
This is a lower sigmoid threshold, equivalent to a logit cutoff at qlogis(0.28).
It was chosen to maximize clinical utility, aligning better with the clinical utility matrix (ematrix).
Lowering the threshold can help capture more true positives when missing them is costlier (important in medicine).
Purpose of comparison:
To assess how the standard neural network threshold compares to a clinically optimized threshold, using the expected utility based on real clinical priorities rather than pure classification accuracy.
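A minimal sketch of how this comparison can be run on the same test set, assuming the CNN's outputs are the logit columns loaded earlier; the helper cnn_decide is hypothetical, and thresholding the sigmoid probability at 0.28 is equivalent to thresholding the raw logit at qlogis(0.28) ≈ -0.94:

Code

# Hypothetical helper: binarize the CNN's sigmoid outputs at a threshold
# and map the label pair to the same outcome index used for truevalues
cnn_decide <- function(data, threshold) {
  eff <- as.integer(plogis(data$LOGIT_EFFUSION) >= threshold)
  ate <- as.integer(plogis(data$LOGIT_ATELECTASIS) >= threshold)
  eff + 2 * ate + 1
}

# Equivalently, the raw logits could be compared against qlogis(threshold);
# the lower threshold admits more positive calls on both labels
decisions_cnn_05  <- cnn_decide(testdata, 0.50)  # default sigmoid threshold
decisions_cnn_028 <- cnn_decide(testdata, 0.28)  # clinically optimized

# Average realized utility under the clinical utility matrix,
# computed exactly as for the Inferno decisions above
mean(ematrix[cbind(decisions_cnn_05,  truevalues)])
mean(ematrix[cbind(decisions_cnn_028, truevalues)])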