pbmc_cell_frequency | Removed 550 specimens because missing in meta data
plasma_ab_titer | Removed 6931 specimens because missing in meta data
plasma_cytokine_concentration_by_olink | Removed 495 specimens because missing in meta data
pbmc_cell_frequency | Removed 56 features because not in feature subset
plasma_ab_titer | Removed 48 features because not in feature subset
plasma_cytokine_concentration_by_olink | Removed 234 features because not in feature subset
t_cell_polarization | Removed 3 features because not in feature subset
plasma_cytokine_concentration_by_olink | Removed 300 features because qc warning
plasma_ab_titer | Removed 10540 measurements because wrong unit used
plasma_cytokine_concentration_by_olink | Removed 2400 measurements because wrong unit used
plasma_cytokine_concentration_by_legendplex | Removed 8 because specimen is outlier
plasma_cytokine_concentration_by_olink | Removed specimen 750, 760, 824, 833, 894, 903 because fraction of measurements below LOQ > 50%
plasma_ab_titer | Removed specimen 674, 675, 676 because fraction of measurements below LOD > 50%
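The LOQ/LOD filters in the log above drop whole specimens once more than half of their measurements fall below the limit of quantification (or detection). A minimal sketch of that rule, assuming a hypothetical long-format measurements table with a logical below_loq column (the actual preprocessing code is folded away):

Code

# Minimal sketch of the LOQ-based specimen filter implied by the logs above.
# `measurements` (columns: specimen_id, below_loq) is an assumed input,
# not an object from this notebook.
library(dplyr)

MAX_BELOW_LOQ_FRAC <- 0.5

specimens_to_drop <- measurements %>%
  group_by(specimen_id) %>%
  summarise(frac_below_loq = mean(below_loq)) %>%
  filter(frac_below_loq > MAX_BELOW_LOQ_FRAC) %>%
  pull(specimen_id)

measurements_filtered <- measurements %>%
  filter(!specimen_id %in% specimens_to_drop)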
converting counts to integer mode
Found 4 batches
Using null model in ComBat-seq.
Adjusting for 0 covariate(s) or covariate level(s)
Estimating dispersions
Fitting the GLM model
Shrinkage off - using GLM estimates for parameters
Adjusting the data
Found 3 batches
Adjusting for 0 covariate(s) or covariate level(s)
Standardizing Data across genes
Fitting L/S model and finding priors
Finding parametric adjustments
Adjusting the Data
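The messages above come from batch correction with the sva package: ComBat-seq on the RNA-seq counts (4 batches, null model, shrinkage off) followed by a parametric ComBat run on normalized data (3 batches). A minimal sketch of such calls, assuming `counts` is a gene-by-sample integer matrix, `expr` a normalized gene-by-sample matrix, and `batch` a vector of cohort labels (these inputs are assumptions, not the notebook's objects):

Code

# Minimal sketch of sva calls consistent with the logs above.
library(sva)

# ComBat-seq on raw counts: no covariates (null model), shrinkage off
counts_adj <- ComBat_seq(counts, batch = batch, group = NULL, shrink = FALSE)

# parametric ComBat on normalized expression values
expr_adj <- ComBat(dat = expr, batch = batch, par.prior = TRUE)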
3 Tasks
Code
task_meta <- list(
  task_11 = list(
    name = "task_11",
    header = "## Task 1.1",
    description = "Rank the individuals by IgG antibody levels against pertussis toxin (PT) that we detect in plasma 14 days post booster vaccinations."
  ),
  task_12 = list(
    name = "task_12",
    header = "## Task 1.2",
    description = "Rank the individuals by fold change of IgG antibody levels against pertussis toxin (PT) that we detect in plasma 14 days post booster vaccinations compared to titer values at day 0."
  ),
  task_21 = list(
    name = "task_21",
    header = "## Task 2.1",
    description = "Rank the individuals by predicted frequency of Monocytes on day 1 post boost after vaccination."
  ),
  task_22 = list(
    name = "task_22",
    header = "## Task 2.2",
    description = "Rank the individuals by fold change of predicted frequency of Monocytes on day 1 post booster vaccination compared to cell frequency values at day 0."
  ),
  task_31 = list(
    name = "task_31",
    header = "## Task 3.1",
    description = "Rank the individuals by predicted gene expression of CCL3 on day 3 post-booster vaccination."
  ),
  task_32 = list(
    name = "task_32",
    header = "## Task 3.2",
    description = "Rank the individuals by fold change of predicted gene expression of CCL3 on day 3 post booster vaccination compared to gene expression values at day 0."
  ),
  task_41 = list(
    name = "task_41",
    header = "## Task 4.1",
    description = "Rank the individuals based on their Th1/Th2 (IFN-g/IL-5) polarization ratio on Day 30 post-booster vaccination."
  )
)
4 Rationale
How to choose the right model? (A sketch of scoring these criteria follows the list.)
- Top ranking performance
- Low variance between the test sets
- If several models reach top performance with low variance, choose the more regularized model
- If several models reach top performance with low variance, prefer a model that does not rely on an assay that is often missing in the test data (-> Olink!)
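As a minimal sketch of the first two criteria: summarise each model x feature-set combination by the mean and standard deviation of its test-set correlations. The `scores_df` input here is a hypothetical per-test-set score table (e.g. Spearman correlations derived from `result_list` below); the actual scoring chunk is not shown.

Code

# Hypothetical ranking of models by the criteria above, assuming a
# `scores_df` with one Spearman correlation per (model, feature_set, testset).
library(dplyr)

model_summary <- scores_df %>%
  group_by(model, feature_set) %>%
  summarise(
    mean_spearman = mean(spearman), # top ranking performance
    sd_spearman = sd(spearman),     # variance between the test sets
    .groups = "drop"
  ) %>%
  arrange(desc(mean_spearman), sd_spearman)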
5 Conclusions
TODO
6 Prepare Features
Code
# get the 2000 most variable genes based on the mean dispersion (variance/mean) across cohorts
N_HVG <- 2000
hvg <- gene_meta %>%
  dplyr::mutate("prefix_versioned_ensembl_gene_id_clean" = paste0("pbmc_gene_expression_", versioned_ensembl_gene_id_clean)) %>%
  dplyr::slice_max(mean_rank, n = N_HVG, with_ties = TRUE)

meta_data_cov <- get_metadata_covariates(meta_data) %>%
  dplyr::select(-c(ethnicity_Not_Hispanic_or_Latino, ethnicity_Hispanic_or_Latino))

experimental_predictors_day_0 <- generate_wide_experimental_data(
  experimental_data = experimental_data,
  impute = "knn",
  max_NA_frac_sample = 0.5,
  max_NA_frac_feature = 0.25,
  verbose = TRUE
) %>%
  purrr::imap(function(df, modality) {
    colnames(df) <- paste0(modality, "_", colnames(df)) # since feature names can be repeated across assays
    df <- as.data.frame(df) %>%
      tibble::rownames_to_column("specimen_id") %>%
      dplyr::mutate(specimen_id = as.numeric(specimen_id)) %>%
      dplyr::inner_join(
        (specimen_per_day$day_0 %>% dplyr::select(subject_id, specimen_id, dataset)),
        by = "specimen_id"
      ) %>%
      dplyr::select(-c(specimen_id, dataset)) %>%
      dplyr::select(subject_id, everything())
    return(df)
  })
plasma_cytokine_concentration_by_olink | NA Fraction: 0.00288417166589755 | Imputed with knn imputation
t_cell_activation | NA Fraction: 0.0576923076923077 | Removed samples: 681
t_cell_activation | NA Fraction: 0.0553691275167785 | Imputed with knn imputation
t_cell_polarization | NA Fraction: 0.0134099616858238 | Imputed with knn imputation
Olink is missing quite often because I removed all measurements for the 2020 cohort.
Code
# which assays are most/least often missing?
meta_data_plus_assays_day_0 %>%
  dplyr::select(dplyr::all_of(names(experimental_predictors_day_0))) %>%
  dplyr::mutate(row_sum = apply(select(., matches(".+")), 1, sum)) %>%
  dplyr::filter(row_sum > 0) %>%
  tidyr::pivot_longer(cols = !row_sum) %>%
  dplyr::group_by(name) %>%
  dplyr::summarise(mean = mean(value)) %>%
  dplyr::arrange(desc(mean))
Code
MAX_N_ASSAYS_MISSING <- 2
K_FOR_KNN_IMPUTE <- 7

feature_list <- list(
  "metadata" = meta_data_cov,
  "metadata+baseline" = meta_data_cov # will take care of that logic in loop
)

# check for each feature setting (aka modality setting)
# whether too many assays are missing for any subject
# (by default we remove a subject if more than 2 of the required feature sets are missing)
# usually we exclude here subjects from the 2020 dataset
for (modality_set in modality_powerset) {
  # modality_set <- modality_powerset[[100]]
  modality_set_name <- paste0(paste0(modality_set, collapse = "_pca+"), "_pca+metadata+baseline")

  # inner join here, other subjects have no assays for day 0...
  modality_set_check_assays <- meta_data_cov %>%
    dplyr::inner_join(
      dplyr::select(meta_data_plus_assays_day_0, dplyr::all_of(c("subject_id", modality_set))),
      by = "subject_id"
    )
  modality_set_check_assays[, modality_set][is.na(modality_set_check_assays[, modality_set])] <- FALSE
  modality_set_check_assays[, "n_assays_present"] <- rowSums(modality_set_check_assays[, modality_set])
  modality_set_check_assays[, "n_assays_missing"] <- length(modality_set) - modality_set_check_assays[, "n_assays_present"]
  #modality_set_check_assays[, "n_assays_max"] <- length(modality_set)
  #modality_set_check_assays <- modality_set_check_assays %>% dplyr::mutate(n_assays_frac = n_assays_present / n_assays_max)

  subjects_to_exclude <- modality_set_check_assays %>%
    dplyr::filter(n_assays_missing >= !!MAX_N_ASSAYS_MISSING) %>%
    dplyr::pull(subject_id)

  modality_set_meta_df <- meta_data_cov %>%
    dplyr::filter(!(subject_id %in% subjects_to_exclude))

  modality_set_df <- experimental_predictors_day_0_pca[modality_set] %>%
    purrr::reduce(full_join, by = "subject_id") %>%
    dplyr::inner_join(modality_set_meta_df, by = "subject_id")

  modality_set_mtx <- modality_set_df %>%
    tibble::column_to_rownames("subject_id") %>%
    as.matrix() %>%
    t() %>%
    impute::impute.knn(data = ., k = K_FOR_KNN_IMPUTE, maxp = 10000, colmax = 1) %>%
    .$data %>%
    t()

  feature_list[[modality_set_name]] <- as.data.frame(modality_set_mtx) %>%
    tibble::rownames_to_column("subject_id") %>%
    dplyr::mutate(subject_id = as.numeric(subject_id))
}

# adding power set of single-omic predictors
for (modality_set in modality_powerset) {
  # modality_set <- modality_powerset[[100]]
  modality_set_name <- paste0(paste0(modality_set, collapse = "+"), "+metadata+baseline")

  # inner join here, other subjects have no assays for day 0...
  modality_set_check_assays <- meta_data_cov %>%
    dplyr::inner_join(
      dplyr::select(meta_data_plus_assays_day_0, dplyr::all_of(c("subject_id", modality_set))),
      by = "subject_id"
    )
  modality_set_check_assays[, modality_set][is.na(modality_set_check_assays[, modality_set])] <- FALSE
  modality_set_check_assays[, "n_assays_present"] <- rowSums(modality_set_check_assays[, modality_set])
  modality_set_check_assays[, "n_assays_missing"] <- length(modality_set) - modality_set_check_assays[, "n_assays_present"]
  #modality_set_check_assays[, "n_assays_max"] <- length(modality_set)
  #modality_set_check_assays <- modality_set_check_assays %>% dplyr::mutate(n_assays_frac = n_assays_present / n_assays_max)

  subjects_to_exclude <- modality_set_check_assays %>%
    dplyr::filter(n_assays_missing >= !!MAX_N_ASSAYS_MISSING) %>%
    dplyr::pull(subject_id)

  modality_set_meta_df <- meta_data_cov %>%
    dplyr::filter(!(subject_id %in% subjects_to_exclude))

  modality_set_df <- experimental_predictors_day_0[modality_set] %>%
    purrr::reduce(full_join, by = "subject_id") %>%
    dplyr::inner_join(modality_set_meta_df, by = "subject_id")

  modality_set_mtx <- modality_set_df %>%
    tibble::column_to_rownames("subject_id") %>%
    as.matrix() %>%
    t() %>%
    impute::impute.knn(data = ., k = K_FOR_KNN_IMPUTE, maxp = 10000, colmax = 1) %>%
    .$data %>%
    t()

  feature_list[[modality_set_name]] <- as.data.frame(modality_set_mtx) %>%
    tibble::rownames_to_column("subject_id") %>%
    dplyr::mutate(subject_id = as.numeric(subject_id))
}

stopifnot(all(purrr::map_dbl(feature_list, ~ mean(is.na(.x))) == 0))
#purrr::map(feature_list, ~ dim(.x))
7 Model Evaluation / Assessment
Code
set.seed(42)
tictoc::tic()
result_list <- list()

for (task in task_meta) {
  # task <- task_meta[[1]]
  task_df <- target_list[[task$name]]
  target_df <- task_df %>% dplyr::select(subject_id, target)
  baseline_df <- task_df %>% dplyr::select(subject_id, baseline)

  baseline_model_df <- task_df %>%
    dplyr::left_join(
      dplyr::distinct(dplyr::select(meta_data, c(subject_id, dataset))),
      by = "subject_id"
    )
  all_datasets <- unique(baseline_model_df$dataset)

  for (test_dataset in all_datasets) {
    result_list <- rlist::list.append(result_list, tibble::tibble(
      task = task$name,
      trainset = NA,
      testset = test_dataset,
      test_subset = task_df$subject_id[baseline_model_df$dataset == test_dataset],
      target = task_df$target[baseline_model_df$dataset == test_dataset],
      prediction = task_df$baseline[baseline_model_df$dataset == test_dataset],
      testset_mean = NA,
      model = "baseline",
      feature_set = "baseline"
    ))
  }

  # for (model_name in c("lasso", "elnet", "rf", "rf+boruta")) {
  for (model_name in c("lasso", "elnet", "rf")) {
    # model_name <- "lasso"
    for (feature_name in names(feature_list)) {
      # feature_name <- names(feature_list)[2]
      feature_df <- feature_list[[feature_name]]
      if (str_detect(feature_name, "baseline")) {
        feature_df <- feature_df %>% dplyr::left_join(baseline_df, by = "subject_id")
      }
      model_df <- target_df %>%
        dplyr::left_join(feature_df, by = "subject_id") %>%
        dplyr::left_join(
          dplyr::distinct(dplyr::select(meta_data, c(subject_id, dataset))),
          by = "subject_id"
        )
      #stopifnot(!any(is.na(model_df))) # guard rail
      model_df <- model_df %>% tidyr::drop_na()
      stopifnot(nrow(model_df) == nrow(dplyr::distinct(model_df)))

      all_datasets <- unique(model_df$dataset)
      cross_dataset_out <- purrr::map(all_datasets, function(test_dataset) {
        # test_dataset <- all_datasets[2]
        train_dataset <- all_datasets[!all_datasets %in% test_dataset]
        train_df <- model_df %>%
          dplyr::filter(dataset %in% train_dataset) %>%
          dplyr::select(-c(dataset, subject_id))
        train_mtx <- as.matrix(dplyr::select(train_df, -target))
        train_target <- dplyr::pull(train_df, target)
        test_df_with_subject <- model_df %>% dplyr::filter(dataset %in% test_dataset)
        test_df <- test_df_with_subject %>% dplyr::select(-c(dataset, subject_id))
        test_mtx <- as.matrix(dplyr::select(test_df, -target))
        test_target <- dplyr::pull(test_df, target)

        if (model_name == "rf") {
          model <- ranger::ranger(formula = target ~ ., data = train_df, num.trees = 500)
          predictions <- predict(model, test_df)$predictions
        } else if (model_name == "rf+boruta") {
          if (ncol(train_df) > 2) {
            boruta_out <- Boruta::Boruta(
              formula = target ~ .,
              data = train_df,
              maxRuns = 1000,
              num.trees = 500
            )
            boruta_feats <- names(boruta_out$finalDecision)[boruta_out$finalDecision == "Confirmed"]
          } else {
            boruta_feats <- colnames(train_df)[colnames(train_df) != "target"]
          }
          if (length(boruta_feats) == 0) {
            predictions <- rep(mean(train_df$target), nrow(test_df))
          } else {
            train_df_subset <- train_df %>% dplyr::select(dplyr::all_of(c(boruta_feats, "target")))
            test_df_subset <- test_df %>% dplyr::select(dplyr::all_of(c(boruta_feats, "target")))
            model <- ranger::ranger(formula = target ~ ., data = train_df_subset, num.trees = 500)
            predictions <- predict(model, test_df_subset)$predictions
          }
        } else if (model_name == "lasso") {
          # standardize the features, even though glmnet is doing this automatically!
          # feature_means <- matrixStats::colMeans2(train_mtx)
          # feature_sds <- matrixStats::colSds(train_mtx)
          # train_mtx <- t((t(train_mtx) - feature_means) / feature_sds)
          # train_target <- (train_target - mean(train_target)) / sd(train_target)
          # test_mtx <- t((t(test_mtx) - feature_means) / feature_sds)

          # inner hyperparameter optim loop using LOOCV to find best lambda
          model <- glmnet::cv.glmnet(
            x = train_mtx,
            y = train_target,
            nfolds = nrow(train_mtx),
            grouped = FALSE,
            alpha = 1, # LASSO
            type.measure = "mae", # not sure whether this is better
            standardize = TRUE, # by default each feature is standardized internally
            family = "gaussian"
          )
          # investigate
          # beta_mtx <- as.data.frame(as.matrix(model$glmnet.fit$beta))
          # colnames(beta_mtx) <- paste0("lambda_", round(model$lambda, 3))
          # plot(model)
          # lambda_value <- model$lambda.1se # model$lambda.min

          # never use zero features (with lambda=model$lambda[1]), since constant predictions
          # will have NA spearman...
          lambda_value <- ifelse(model$lambda.min == model$lambda[1], model$lambda[2], model$lambda.min)
          predictions <- predict(model, newx = test_mtx, s = lambda_value)[, 1]
        } else if (model_name == "elnet") {
          # standardize the features, even though glmnet is doing this automatically!
          # feature_means <- matrixStats::colMeans2(train_mtx)
          # feature_sds <- matrixStats::colSds(train_mtx)
          # train_mtx <- t((t(train_mtx) - feature_means) / feature_sds)
          # train_target <- (train_target - mean(train_target)) / sd(train_target)
          # test_mtx <- t((t(test_mtx) - feature_means) / feature_sds)

          # inner hyperparameter optim loop using LOOCV to find best lambda
          model <- glmnet::cv.glmnet(
            x = train_mtx,
            y = train_target,
            nfolds = nrow(train_mtx),
            grouped = FALSE,
            alpha = 0.5, # elastic net
            type.measure = "mae", # not sure whether this is better
            standardize = TRUE, # by default each feature is standardized internally
            family = "gaussian"
          )
          lambda_value <- ifelse(model$lambda.min == model$lambda[1], model$lambda[2], model$lambda.min)
          predictions <- predict(model, newx = test_mtx, s = lambda_value)[, 1]
        } else {
          stop(paste0(model_name, " not implemented"))
        }

        tibble::tibble(
          task = task$name,
          trainset = paste0(train_dataset, collapse = "__"),
          testset = test_dataset,
          test_subset = test_df_with_subject$subject_id,
          target = test_df$target,
          prediction = predictions,
          testset_mean = mean(test_df$target),
          model = model_name,
          feature_set = feature_name
        )
      }) %>% dplyr::bind_rows()

      result_list <- rlist::list.append(result_list, cross_dataset_out)
    }
  }
}
Warning: from glmnet C++ code (error code -99); Convergence for 99th lambda
value not reached after maxit=100000 iterations; solutions for larger lambdas
returned
Warning: from glmnet C++ code (error code -93); Convergence for 93th lambda
value not reached after maxit=100000 iterations; solutions for larger lambdas
returned
Warning: from glmnet C++ code (error code -98); Convergence for 98th lambda
value not reached after maxit=100000 iterations; solutions for larger lambdas
returned
Warning: from glmnet C++ code (error code -94); Convergence for 94th lambda
value not reached after maxit=100000 iterations; solutions for larger lambdas
returned
Warning: from glmnet C++ code (error code -91); Convergence for 91th lambda
value not reached after maxit=100000 iterations; solutions for larger lambdas
returned
Warning: from glmnet C++ code (error code -97); Convergence for 97th lambda
value not reached after maxit=100000 iterations; solutions for larger lambdas
returned
Warning: from glmnet C++ code (error code -91); Convergence for 91th lambda
value not reached after maxit=100000 iterations; solutions for larger lambdas
returned
Warning: from glmnet C++ code (error code -95); Convergence for 95th lambda
value not reached after maxit=100000 iterations; solutions for larger lambdas
returned
Why is that? This happens when the Boruta algorithm returns zero important features, in which case the test-set predictions are set to a constant. And since a constant vector has zero standard deviation, we cannot compute the Spearman correlation (which divides by the standard deviation).
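This is easy to reproduce:

Code

# A constant prediction vector has zero standard deviation, so the Spearman
# correlation is undefined; R returns NA (with a warning).
cor(rep(1, 4), c(0.2, 0.5, 0.1, 0.9), method = "spearman")
#> [1] NA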
prediction_specs <- list(
  "task_11" = list(
    "model" = "baseline",
    "feature_set" = "baseline",
    "table_col" = "1.1) IgG-PT-D14-titer-Rank"
  ),
  "task_12" = list(
    "model" = "lasso",
    "feature_set" = "pbmc_cell_frequency+t_cell_polarization+metadata+baseline",
    "table_col" = "1.2) IgG-PT-D14-FC-Rank"
  ),
  "task_21" = list(
    "model" = "rf",
    "feature_set" = "pbmc_cell_frequency_pca+plasma_cytokine_concentration_by_legendplex_pca+t_cell_activation_pca+metadata+baseline",
    "table_col" = "2.1) Monocytes-D1-Rank"
  ),
  "task_22" = list(
    "model" = "elnet",
    "feature_set" = "plasma_cytokine_concentration_by_legendplex+t_cell_activation+t_cell_polarization+metadata+baseline",
    "table_col" = "2.2) Monocytes-D1-FC-Rank"
  ),
  "task_31" = list(
    "model" = "lasso",
    "feature_set" = "plasma_cytokine_concentration_by_legendplex+t_cell_polarization+metadata+baseline",
    "table_col" = "3.1) CCL3-D3-Rank"
  ),
  "task_32" = list(
    "model" = "rf",
    "feature_set" = "plasma_ab_titer_pca+metadata+baseline",
    "table_col" = "3.2) CCL3-D3-FC-Rank"
  ),
  "task_41" = list(
    "model" = "lasso",
    "feature_set" = "pbmc_cell_frequency+plasma_cytokine_concentration_by_legendplex+t_cell_activation+metadata+baseline",
    "table_col" = "4.1) IFNG/IL5-Polarization-D30-Rank"
  )
)

# checks
## 0) all task column names are in submission table
purrr::map_lgl(prediction_specs, ~ .x[["table_col"]] %in% colnames(submission_df)) %>%
  all() %>%
  stopifnot()

## a) all tasks are specified
stopifnot(dplyr::setequal(names(task_meta), names(prediction_specs)))

## b) all features are present
purrr::map_lgl(prediction_specs, ~ length(str_split(.x[["feature_set"]], "\\+")[[1]]) >= 1) %>%
  all() %>%
  stopifnot()
purrr::map_lgl(prediction_specs, ~ .x[["feature_set"]] %in% c(names(feature_list), "baseline")) %>%
  all() %>%
  stopifnot()

## c) which features are never selected?
selected_features <- purrr::map(prediction_specs, ~ str_split(.x[["feature_set"]], "\\+")[[1]]) %>%
  unlist() %>%
  unique()
all_features <- purrr::map(names(feature_list), ~ str_split(.x, "\\+")[[1]]) %>%
  unlist() %>%
  unique()
all_features[!all_features %in% selected_features]

rm(selected_features, all_features)

## d) which features are most commonly used
purrr::map(prediction_specs, ~ str_split(.x[["feature_set"]], "\\+")[[1]]) %>%
  unlist() %>%
  stringr::str_remove_all(., "_pca$") %>%
  table() %>%
  sort(decreasing = TRUE)
inner_join: added 20 columns (PC1.x, PC2.x, PC3.x, PC4.x, PC5.x, ...)
> rows only in x (15)
> rows only in feature_df (57)
> matched rows 51
> ====
> rows total 51
Baseline values are missing for 6 samples! Imputing with Median
inner_join: added 28 columns (plasma_cytokine_concentration_by_legendplex_P01375, plasma_cytokine_concentration_by_legendplex_P01563, plasma_cytokine_concentration_by_legendplex_P01579, plasma_cytokine_concentration_by_legendplex_P02778, plasma_cytokine_concentration_by_legendplex_P05231, ...)
> rows only in x (16)
> rows only in feature_df (57)
> matched rows 50
> ====
> rows total 50
Baseline values are missing for 6 samples! Imputing with Median
inner_join: added 24 columns (plasma_cytokine_concentration_by_legendplex_P01375, plasma_cytokine_concentration_by_legendplex_P01563, plasma_cytokine_concentration_by_legendplex_P01579, plasma_cytokine_concentration_by_legendplex_P02778, plasma_cytokine_concentration_by_legendplex_P05231, ...)
> rows only in x (29)
> rows only in feature_df (54)
> matched rows 54
> ====
> rows total 54
Baseline values are missing for 1 samples! Imputing with Median
inner_join: added 20 columns (PC1, PC2, PC3, PC4, PC5, ...)
> rows only in x (30)
> rows only in feature_df (54)
> matched rows 53
> ====
> rows total 53
Baseline values are missing for 1 samples! Imputing with Median
inner_join: added 32 columns (pbmc_cell_frequency_Basophils, pbmc_cell_frequency_Bcells, pbmc_cell_frequency_CD4Tcells, pbmc_cell_frequency_CD8Tcells, pbmc_cell_frequency_Classical_Monocytes, ...)
> rows only in x ( 1)
> rows only in feature_df (66)
> matched rows 42
> ====
> rows total 42
Baseline values are missing for 3 samples! Imputing with Median
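The "Imputing with Median" messages suggest a simple fallback for subjects without a day-0 measurement. A sketch of what such a helper could look like (this is an assumption; the actual chunk is folded away):

Code

# Minimal sketch of the median fallback suggested by the log messages above;
# an assumed implementation, not the notebook's actual code.
impute_baseline_median <- function(df) {
  n_missing <- sum(is.na(df$baseline))
  if (n_missing > 0) {
    message("Baseline values are missing for ", n_missing, " samples! Imputing with Median")
    df$baseline[is.na(df$baseline)] <- median(df$baseline, na.rm = TRUE)
  }
  df
}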
left_join: added one column (predictions)
> rows only in x 0
> rows only in task_preds_df ( 0)
> matched rows 54
> ====
> rows total 54
left_join: added one column (predictions)
> rows only in x 0
> rows only in task_preds_df ( 0)
> matched rows 54
> ====
> rows total 54
left_join: added one column (predictions)
> rows only in x 0
> rows only in task_preds_df ( 0)
> matched rows 54
> ====
> rows total 54
left_join: added one column (predictions)
> rows only in x 0
> rows only in task_preds_df ( 0)
> matched rows 54
> ====
> rows total 54
left_join: added one column (predictions)
> rows only in x 0
> rows only in task_preds_df ( 0)
> matched rows 54
> ====
> rows total 54
left_join: added one column (predictions)
> rows only in x 0
> rows only in task_preds_df ( 0)
> matched rows 54
> ====
> rows total 54
left_join: added one column (predictions)
> rows only in x 0
> rows only in task_preds_df ( 0)
> matched rows 54
> ====
> rows total 54
Code
submission_df_with_predictions
Lastly, we need to create ranks from those predictions. From the challenge instructions:
We expect contestants to generate computational models, and upon making predictions, these values are ranked from highest to lowest (i.e. highest = 1, lowest = N) before making a final submission.
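A minimal sketch of that ranking step, assuming each task's prediction column in submission_df_with_predictions is named according to prediction_specs$table_col:

Code

# Convert predictions to ranks per task: highest prediction = rank 1,
# lowest = N. Assumption: the prediction columns already carry the final
# table_col names from prediction_specs.
rank_high_to_low <- function(x) rank(-x, ties.method = "min")

task_cols <- unname(purrr::map_chr(prediction_specs, "table_col"))

submission_df_ranked <- submission_df_with_predictions %>%
  dplyr::mutate(dplyr::across(dplyr::all_of(task_cols), rank_high_to_low))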