Author

Philipp Sven Lars Schäfer

Published

November 28, 2024

1 Packages

Code
suppressPackageStartupMessages({
  library(tidyverse)
  library(flextable)
  library(ggdark)
  library(magick)
  library(glmnet)
  library(ranger)
  source(file.path("..", "src", "read_data.R"))
  source(file.path("..", "src", "colors.R"))
  source(file.path("..", "src", "generate_targets.R"))
  source(file.path("..", "src", "model.R"))
  source(file.path("..", "src", "normalize_integrate.R"))
})

2 Data

Code
input_dir = file.path("..", "data")
output_dir = file.path("..", "results", "model_selection_integrated")
dir.create(output_dir, showWarnings = FALSE, recursive = TRUE)
Code
celltype_meta <- read_celltype_meta(input_dir)
gene_meta <- read_gene_meta_plus(input_dir)
protein_meta <- read_protein_meta(input_dir)

meta_data <- read_harmonized_meta_data(input_dir)
specimen_per_day <- get_specimen_per_day(meta_data=meta_data)

RECOMPUTE <- TRUE
if (RECOMPUTE) {
  raw_experimental_data <- read_raw_experimental_data(input_dir)
  
  filtered_experimental_data <- filter_experimental_data(
    meta_data=meta_data, 
    experimental_data=raw_experimental_data,
    gene_meta=gene_meta)
  
  write_rds(filtered_experimental_data, 
            file = file.path(input_dir, "prc_datasets", 
                             "filtered_experimental_data.RDS"))
  
  specimen_to_assay <- purrr::map(
    list("raw" = raw_experimental_data, "filtered" = filtered_experimental_data),
    function(exp_data) {
    # exp_data <- filtered_experimental_data; exp_name = "filtered"
    purrr::imap(exp_data, function(exp_df, exp_name) {
      exp_df %>%
        dplyr::select(specimen_id) %>%
        dplyr::distinct() %>%
        dplyr::mutate(assay = exp_name, specimen_id = as.numeric(specimen_id))
    }) %>%
      dplyr::bind_rows() %>%
      dplyr::mutate(present = TRUE) %>%
      tidyr::pivot_wider(names_from=assay, values_from=present) %>%
      mutate_all(~ ifelse(is.na(.), FALSE, .))
  })
  
  meta_data_plus_assays <- purrr::map(specimen_to_assay, function(df) {
    meta_data %>%
      dplyr::left_join(df, by="specimen_id") %>%
      mutate_all(~ ifelse(is.na(.), FALSE, .))
  })
  
  write_rds(specimen_to_assay, 
            file = file.path(input_dir, "prc_datasets", 
                             "specimen_to_assay.RDS"))
  write_rds(meta_data_plus_assays, 
            file = file.path(input_dir, "prc_datasets", 
                             "meta_data_plus_assays.RDS"))
  
  normalized_experimental_data <- normalize_experimental_data(
    meta_data=meta_data, 
    raw_experimental_data=filtered_experimental_data,
    gene_meta=gene_meta)
  
  write_rds(normalized_experimental_data, 
            file = file.path(input_dir, "prc_datasets", 
                             "normalized_experimental_data.RDS"))
  
  integrated_experimental_data <- integrate_experimental_data(
    meta_data=meta_data, 
    normalized_experimental_data=normalized_experimental_data)
  
  write_rds(integrated_experimental_data, 
            file = file.path(input_dir, "prc_datasets", 
                             "integrated_experimental_data.RDS"))

  # targets are computed from the integrated experimental data
  target_list <- generate_all_targets(
    meta_data=meta_data, 
    experimental_data=integrated_experimental_data, 
    experimental_data_settings=experimental_data_settings, 
    gene_meta=gene_meta,
    protein_meta=protein_meta
    )
  
  # baselines are computed from the integrated experimental data
  baseline_list <- generate_all_baselines(
    meta_data=meta_data, 
    experimental_data=integrated_experimental_data, 
    experimental_data_settings=experimental_data_settings, 
    gene_meta=gene_meta,
    protein_meta=protein_meta
    )
  
  stopifnot(identical(names(target_list), names(baseline_list)))
  purrr::map_lgl(names(target_list), function(task_name) {
    (target_list[[task_name]] %>%
      dplyr::inner_join(baseline_list[[task_name]], by="subject_id") %>%
      dplyr::summarise(cor = cor(baseline.x, baseline.y)) %>%
      dplyr::pull(cor)) == 1
  }) %>%
    all() %>%
    stopifnot()
  
  write_rds(target_list, file = file.path(input_dir, "prc_datasets", 
                                          "target_list.RDS"))
  
  write_rds(baseline_list, file = file.path(input_dir, "prc_datasets", 
                                            "baseline_list.RDS"))
  
  rm(raw_experimental_data, filtered_experimental_data)
  experimental_data <- integrated_experimental_data
  experimental_data <- experimental_data[-which(names(experimental_data) ==
                                                "pbmc_gene_expression_counts")]
} else {
  experimental_data <- read_rds(file = file.path(input_dir, "prc_datasets", 
                                                            "integrated_experimental_data.RDS"))
  experimental_data <- experimental_data[-which(names(experimental_data) ==
                                                "pbmc_gene_expression_counts")]
  
  target_list <- read_rds(file = file.path(input_dir, "prc_datasets",  "target_list.RDS"))
  
  baseline_list <- read_rds(file = file.path(input_dir, "prc_datasets", "baseline_list.RDS"))
  
  specimen_to_assay <- read_rds(file = file.path(input_dir, "prc_datasets", "specimen_to_assay.RDS"))
  
  meta_data_plus_assays <- read_rds(file = file.path(input_dir, "prc_datasets", "meta_data_plus_assays.RDS"))
}
pbmc_cell_frequency | Removed 550 specimens because missing in meta data
plasma_ab_titer | Removed 6931 specimens because missing in meta data
plasma_cytokine_concentration_by_olink | Removed 495 specimens because missing in meta data
pbmc_cell_frequency | Removed 56 features because not in feature subset
plasma_ab_titer | Removed 48 features because not in feature subset
plasma_cytokine_concentration_by_olink | Removed 234 features because not in feature subset
t_cell_polarization | Removed 3 features because not in feature subset
plasma_cytokine_concentration_by_olink | Removed 300 features because qc warning
plasma_ab_titer | Removed 10540 measurements because wrong unit used
plasma_cytokine_concentration_by_olink | Removed 2400 measurements because wrong unit used
plasma_cytokine_concentration_by_legendplex | Removed 8 because specimen is outlier
plasma_cytokine_concentration_by_olink | Removed specimen 750, 760, 824, 833, 894, 903 because fraction of measurements below LOQ > 50%
plasma_ab_titer | Removed specimen 674, 675, 676 because fraction of measurements below LOD > 50%
converting counts to integer mode
Found 4 batches
Using null model in ComBat-seq.
Adjusting for 0 covariate(s) or covariate level(s)
Estimating dispersions
Fitting the GLM model
Shrinkage off - using GLM estimates for parameters
Adjusting the data
Found 3 batches
Adjusting for 0 covariate(s) or covariate level(s)
Standardizing Data across genes
Fitting L/S model and finding priors
Finding parametric adjustments
Adjusting the Data
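
The ComBat(-seq) messages above are emitted by the integration step (integrate_experimental_data, sourced from src/normalize_integrate.R). A minimal sketch of such calls, assuming hypothetical objects counts_mtx / expr_mtx (features x samples) and a batch vector batch_vec; this is not the sourced implementation:

Code
library(sva)
# counts: ComBat-seq with a null model (no covariates), as in the log above
corrected_counts <- ComBat_seq(counts = counts_mtx, batch = batch_vec)
# continuous assays: standard ComBat
corrected_expr <- ComBat(dat = expr_mtx, batch = batch_vec)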

3 Tasks

Code
task_meta <- list(
  task_11 = list(
    name = "task_11",
    header = "## Task 1.1",
    description = "Rank the individuals by IgG antibody levels against pertussis toxin (PT) that we detect in plasma 14 days post booster vaccinations."
  ),
  task_12 = list(
    name = "task_12",
    header = "## Task 1.2",
    description = "Rank the individuals by fold change of IgG antibody levels against pertussis toxin (PT) that we detect in plasma 14 days post booster vaccinations compared to titer values at day 0."
  ),
  task_21 = list(
    name = "task_21",
    header = "## Task 2.1",
    description = "Rank the individuals by predicted frequency of Monocytes on day 1 post boost after vaccination."
  ),
  task_22 = list(
    name = "task_22",
    header = "## Task 2.2",
    description = "Rank the individuals by fold change of predicted frequency of Monocytes on day 1 post booster vaccination compared to cell frequency values at day 0."
  ),
  task_31 = list(
    name = "task_31",
    header = "## Task 3.1",
    description = "Rank the individuals by predicted gene expression of CCL3 on day 3 post-booster vaccination."
  ),
  task_32 = list(
    name = "task_32",
    header = "## Task 3.2",
    description = "Rank the individuals by fold change of predicted gene expression of CCL3 on day 3 post booster vaccination compared to gene expression values at day 0."
  ),
  task_41 = list(
    name = "task_41",
    header = "## Task 4.1",
    description = "Rank the individuals based on their Th1/Th2 (IFN-g/IL-5) polarization ratio on Day 30 post-booster vaccination."
  )
)

4 Rationale

  • How do we choose the right model? (a sketch follows this list)

    • Top ranking performance

    • Low variance across the test sets

    • If several models share top performance and low variance, choose the more regularized model

    • If several models share top performance and low variance, choose a model that does not rely on an assay that is often missing in the test data (-> Olink!)
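
A minimal sketch of how these criteria could be applied per task to the model_selection_df built in section 7 (columns task, model, feature_set, srho_mean, srho_sd are assumed); the function shortlist_models and its cutoffs are illustrative assumptions, not the selection procedure actually used below:

Code
shortlist_models <- function(model_selection_df, top_n = 5, sd_quantile = 0.5) {
  model_selection_df %>%
    dplyr::group_by(task) %>%
    # keep the candidates with the best mean Spearman rho across test sets ...
    dplyr::slice_max(srho_mean, n = top_n, with_ties = TRUE) %>%
    # ... drop the high-variance ones among them ...
    dplyr::filter(srho_sd <= stats::quantile(srho_sd, sd_quantile, na.rm = TRUE)) %>%
    # ... and deprioritize feature sets relying on Olink (often missing in test data);
    # the preference for stronger regularization remains a manual tie-break
    dplyr::arrange(stringr::str_detect(feature_set, "olink"),
                   dplyr::desc(srho_mean), .by_group = TRUE) %>%
    dplyr::ungroup()
}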

5 Conclusions

TODO

6 Prepare Features

Code
# get the 2000 most variable genes based on the mean dispersion (variance/mean) across cohorts
N_HVG <- 2000
hvg <- gene_meta %>% 
  dplyr::mutate("prefix_versioned_ensembl_gene_id_clean" =
                  paste0("pbmc_gene_expression_", versioned_ensembl_gene_id_clean)) %>%
  dplyr::slice_max(mean_rank, n=N_HVG, with_ties=TRUE)

meta_data_cov <- get_metadata_covariates(meta_data) %>%
  dplyr::select(-c(ethnicity_Not_Hispanic_or_Latino, ethnicity_Hispanic_or_Latino))

experimental_predictors_day_0 <- 
  generate_wide_experimental_data(experimental_data=experimental_data,
                                  impute="knn", 
                                  max_NA_frac_sample = 0.5,
                                  max_NA_frac_feature = 0.25,
                                  verbose=TRUE) %>%
  purrr::imap(function(df, modality) {
    colnames(df) <- paste0(modality, "_", colnames(df)) # since feature names can be repeated across assays
    df <- as.data.frame(df) %>%
      tibble::rownames_to_column("specimen_id") %>%
      dplyr::mutate(specimen_id = as.numeric(specimen_id)) %>%
      dplyr::inner_join((specimen_per_day$day_0 %>% 
                           dplyr::select(subject_id, specimen_id, dataset)),
                        by="specimen_id") %>%
      dplyr::select(-c(specimen_id, dataset)) %>%
      dplyr::select(subject_id, everything())
    return(df)
  })
plasma_cytokine_concentration_by_olink | NA Fraction: 0.00288417166589755 | Imputed with knn imputation
t_cell_activation | NA Fraction: 0.0576923076923077 | Removed samples: 681
t_cell_activation | NA Fraction: 0.0553691275167785 | Imputed with knn imputation
t_cell_polarization | NA Fraction: 0.0134099616858238 | Imputed with knn imputation
Code
experimental_predictors_day_0$pbmc_gene_expression <-
  experimental_predictors_day_0$pbmc_gene_expression[, c("subject_id", hvg$prefix_versioned_ensembl_gene_id_clean)]

#purrr::map(experimental_predictors_day_0, ~ dim(.x))

experimental_predictors_day_0_pca <- experimental_predictors_day_0 %>%
  purrr::imap(function(df, modality) {
    mtx <- df %>% dplyr::select(-subject_id) %>% as.matrix()
    pca <- stats::prcomp(mtx)
    # fraction of variance explained is based on sdev^2, not on sdev itself
    var_explained <- cumsum(pca$sdev^2) / sum(pca$sdev^2)
    n_pcs <- max(which(var_explained < 0.9))
    df_pca <- as.data.frame(pca$x[, 1:n_pcs]) %>%
      dplyr::mutate(subject_id = df$subject_id) %>%
      dplyr::select(subject_id, everything())
    return(df_pca)
})

modality_powerset <- purrr::map(1:length(experimental_predictors_day_0), function(i) {
  utils::combn(names(experimental_predictors_day_0), i, simplify = FALSE)
}) %>%
   unlist(recursive = FALSE)

#purrr::map(experimental_predictors_day_0_pca, ~ dim(.x))

meta_data_plus_assays_day_0 <- meta_data_plus_assays$filtered %>%
  dplyr::filter(specimen_id %in% specimen_per_day$day_0$specimen_id)
stopifnot(length(meta_data_plus_assays_day_0$subject_id) == 
            length(unique(meta_data_plus_assays_day_0$subject_id)))

Check which assays are missing least often.

  • Olink is missing quite often because I removed all Olink measurements for the 2020 cohort
Code
# which assays are most/least often missing?
meta_data_plus_assays_day_0 %>%
  dplyr::select(dplyr::all_of(names(experimental_predictors_day_0))) %>%
  dplyr::mutate(row_sum = apply(select(., matches(".+")), 1, sum)) %>%
  dplyr::filter(row_sum > 0) %>%
  tidyr::pivot_longer(cols=!row_sum) %>%
  dplyr::group_by(name) %>%
  dplyr::summarise(mean = mean(value)) %>%
  dplyr::arrange(desc(mean))
Code
MAX_N_ASSAYS_MISSING <- 2
K_FOR_KNN_IMPUTE <- 7

feature_list <- list(
  "metadata" = meta_data_cov,
  "metadata+baseline" = meta_data_cov # will take care of that logic in loop
)

# check for each feature setting (aka modality setting)
# whether too many assays are missing for any subject
# (by default we remove a subject if 2 or more of the required assays are missing)
# usually this excludes subjects from the 2020 dataset
for (modality_set in modality_powerset) {
  # modality_set <- modality_powerset[[100]]
  modality_set_name <- paste0(paste0(modality_set, collapse = "_pca+"), "_pca+metadata+baseline")
  
  # inner join here; other subjects have no assays for day 0...
  modality_set_check_assays  <- meta_data_cov %>%
    dplyr::inner_join(dplyr::select(meta_data_plus_assays_day_0, dplyr::all_of(c("subject_id", modality_set))), 
                        by="subject_id")
  modality_set_check_assays[, modality_set][is.na(modality_set_check_assays[, modality_set])] <- FALSE
  modality_set_check_assays[, "n_assays_present"] <- rowSums(modality_set_check_assays[, modality_set])
  modality_set_check_assays[, "n_assays_missing"] <- length(modality_set) - modality_set_check_assays[, "n_assays_present"]
  #modality_set_check_assays[, "n_assays_max"] <- length(modality_set)
  #modality_set_check_assays <- modality_set_check_assays %>% dplyr::mutate(n_assays_frac = n_assays_present / n_assays_max)
  
  subjects_to_exclude <- modality_set_check_assays %>%
    dplyr::filter(n_assays_missing >= !!MAX_N_ASSAYS_MISSING) %>%
    dplyr::pull(subject_id)
  
  modality_set_meta_df <- meta_data_cov %>%
    dplyr::filter(!(subject_id %in% subjects_to_exclude))
  
  modality_set_df <- 
    experimental_predictors_day_0_pca[modality_set] %>%
    purrr::reduce(full_join, by="subject_id") %>%
    dplyr::inner_join(modality_set_meta_df, by="subject_id")
  
  modality_set_mtx <- modality_set_df %>%
    tibble::column_to_rownames("subject_id") %>%
    as.matrix() %>%
    t() %>%
    impute::impute.knn(data=., k=K_FOR_KNN_IMPUTE, maxp=10000, colmax=1) %>%
    .$data %>%
    t()
  
  feature_list[[modality_set_name]] <- as.data.frame(modality_set_mtx) %>%
    tibble::rownames_to_column("subject_id") %>%
    dplyr::mutate(subject_id = as.numeric(subject_id))
}

# same loop over the modality power set, now with the full (non-PCA) feature matrices
for (modality_set in modality_powerset) {
  # modality_set <- modality_powerset[[100]]
  modality_set_name <- paste0(paste0(modality_set, collapse = "+"), "+metadata+baseline")
  
  # inner join here; other subjects have no assays for day 0...
  modality_set_check_assays  <- meta_data_cov %>%
    dplyr::inner_join(dplyr::select(meta_data_plus_assays_day_0, dplyr::all_of(c("subject_id", modality_set))), 
                        by="subject_id")
  modality_set_check_assays[, modality_set][is.na(modality_set_check_assays[, modality_set])] <- FALSE
  modality_set_check_assays[, "n_assays_present"] <- rowSums(modality_set_check_assays[, modality_set])
  modality_set_check_assays[, "n_assays_missing"] <- length(modality_set) - modality_set_check_assays[, "n_assays_present"]
  #modality_set_check_assays[, "n_assays_max"] <- length(modality_set)
  #modality_set_check_assays <- modality_set_check_assays %>% dplyr::mutate(n_assays_frac = n_assays_present / n_assays_max)
  
  
  subjects_to_exclude <- modality_set_check_assays %>%
    dplyr::filter(n_assays_missing >= !!MAX_N_ASSAYS_MISSING) %>%
    dplyr::pull(subject_id)
  
  modality_set_meta_df <- meta_data_cov %>%
    dplyr::filter(!(subject_id %in% subjects_to_exclude))
  
  modality_set_df <- 
    experimental_predictors_day_0[modality_set] %>%
    purrr::reduce(full_join, by="subject_id") %>%
    dplyr::inner_join(modality_set_meta_df, by="subject_id")
  
  modality_set_mtx <- modality_set_df %>%
    tibble::column_to_rownames("subject_id") %>%
    as.matrix() %>%
    t() %>%
    impute::impute.knn(data=., k=K_FOR_KNN_IMPUTE, maxp=10000, colmax=1) %>%
    .$data %>%
    t()
  
  feature_list[[modality_set_name]] <- as.data.frame(modality_set_mtx) %>%
    tibble::rownames_to_column("subject_id") %>%
    dplyr::mutate(subject_id = as.numeric(subject_id))
}

stopifnot(all(purrr::map_dbl(feature_list, ~ mean(is.na(.x))) == 0))

#purrr::map(feature_list, ~ dim(.x))

7 Model Evaluation / Assessment

Code
set.seed(42)

tictoc::tic()

result_list <-  list()

for (task in task_meta) {
  # task <- task_meta[[1]]
  
  task_df <- target_list[[task$name]]
  
  target_df <- task_df %>%
    dplyr::select(subject_id, target)
  
  baseline_df <- task_df %>% 
    dplyr::select(subject_id, baseline)
  
  baseline_model_df <- task_df %>%
    dplyr::left_join(dplyr::distinct(dplyr::select(meta_data, c(subject_id, dataset))),
                     by="subject_id")
  
  all_datasets <- unique(baseline_model_df$dataset)
  
  for (test_dataset in all_datasets) {
    result_list <- rlist::list.append(result_list, tibble::tibble(
      task = task$name,
      trainset = NA,
      testset = test_dataset,
      test_subset = task_df$subject_id[baseline_model_df$dataset==test_dataset],
      target = task_df$target[baseline_model_df$dataset==test_dataset], 
      prediction = task_df$baseline[baseline_model_df$dataset==test_dataset],
      testset_mean = NA,
      model = "baseline",
      feature_set = "baseline"
    ))
  }
  # for (model_name in c("lasso", "elnet", "rf", "rf+boruta")) {
  for (model_name in c("lasso", "elnet", "rf")) {
    # model_name <- "lasso"
    for (feature_name in names(feature_list)) {
      # feature_name <- names(feature_list)[2]
      feature_df <- feature_list[[feature_name]]
      
      if (str_detect(feature_name, "baseline")) {
        feature_df <- feature_df %>%
          dplyr::left_join(baseline_df, by="subject_id")
      }
      
      model_df <- target_df %>%
        dplyr::left_join(feature_df, by="subject_id") %>%
        dplyr::left_join(dplyr::distinct(dplyr::select(meta_data, c(subject_id, dataset))),
                         by="subject_id")
      
      #stopifnot(!any(is.na(model_df))) # guard rail
      model_df <- model_df %>% tidyr::drop_na()
      stopifnot(nrow(model_df)==nrow(dplyr::distinct(model_df)))
      
      all_datasets <- unique(model_df$dataset)
      
      cross_dataset_out <- purrr::map(all_datasets, function(test_dataset) {
        # test_dataset <- all_datasets[2]
        train_dataset <- all_datasets[!all_datasets %in% test_dataset]
        
        train_df <- model_df %>%
          dplyr::filter(dataset %in% train_dataset) %>%
          dplyr::select(-c(dataset, subject_id))
        
        train_mtx <- as.matrix(dplyr::select(train_df, -target))
        train_target <- dplyr::pull(train_df, target)
        
        test_df_with_subject <- model_df %>%
          dplyr::filter(dataset %in% test_dataset)
        
        test_df <- test_df_with_subject %>%
          dplyr::select(-c(dataset, subject_id))
        
        test_mtx <- as.matrix(dplyr::select(test_df, -target))
        test_target <- dplyr::pull(test_df, target)
        
        if (model_name == "rf") {
          model <- ranger::ranger(formula = target ~ ., 
                                  data = train_df, 
                                  num.trees = 500)
          predictions <- predict(model, test_df)$predictions
        } 
        else if (model_name == "rf+boruta") {
          if (ncol(train_df) > 2) {
            boruta_out <- Boruta::Boruta(formula = target ~ .,
                                         data = train_df,
                                         maxRuns=1000,
                                         num.trees=500 
                                         )
            boruta_feats <- names(boruta_out$finalDecision)[boruta_out$finalDecision=="Confirmed"]
          } else {
            boruta_feats <- colnames(train_df)[colnames(train_df)!="target"]
          }
          if (length(boruta_feats) == 0) {
            predictions <- rep(mean(train_df$target), nrow(test_df))
          } else {
            train_df_subset <- train_df %>% dplyr::select(dplyr::all_of(c(boruta_feats, "target")))
            test_df_subset <- test_df %>% dplyr::select(dplyr::all_of(c(boruta_feats, "target")))
            model <- ranger::ranger(formula = target ~ ., 
                                    data = train_df_subset, 
                                    num.trees = 500)
            predictions <- predict(model, test_df_subset)$predictions
          }
        } 
        else if (model_name == "lasso") {
          # manual standardization kept for reference; glmnet standardizes internally:
          # feature_means <- matrixStats::colMeans2(train_mtx)
          # feature_sds <- matrixStats::colSds(train_mtx)
          # train_mtx <- t((t(train_mtx) - feature_means) / feature_sds)
          # train_target <- (train_target - mean(train_target)) / sd(train_target)
          #test_mtx <- t((t(test_mtx) - feature_means) / feature_sds)
          
          # inner hyperparameter optim loop using LOOCV to find best lambda
          model <- glmnet::cv.glmnet(x=train_mtx,
                                     y=train_target,
                                     nfolds=nrow(train_mtx),
                                     grouped=FALSE,
                                     alpha=1, # LASSO
                                     type.measure="mae", # not sure whether this is better
                                     standardize=TRUE, # by default each feature is standardized internally
                                     family="gaussian"
                                     )
          # investigate
          #beta_mtx <- as.data.frame(as.matrix(model$glmnet.fit$beta))
          #colnames(beta_mtx) <- paste0("lambda_", round(model$lambda, 3))
          #plot(model)
          #lambda_value <- model$lambda.1se, # model$lambda.min
          # never use zero features (with lambda=model$lambda[1]), since constant predictions
          # will have NA spearman...
          lambda_value <- ifelse(model$lambda.min==model$lambda[1], 
                                 model$lambda[2], model$lambda.min)
          predictions <- predict(model,
                                 newx=test_mtx, 
                                 s=lambda_value)[,1]
        }
        else if (model_name == "elnet") {
          # manual standardization kept for reference; glmnet standardizes internally:
          # feature_means <- matrixStats::colMeans2(train_mtx)
          # feature_sds <- matrixStats::colSds(train_mtx)
          # train_mtx <- t((t(train_mtx) - feature_means) / feature_sds)
          # train_target <- (train_target - mean(train_target)) / sd(train_target)
          #test_mtx <- t((t(test_mtx) - feature_means) / feature_sds)
          
          # inner hyperparameter optim loop using LOOCV to find best lambda
          model <- glmnet::cv.glmnet(x=train_mtx,
                                     y=train_target,
                                     nfolds=nrow(train_mtx),
                                     grouped=FALSE,
                                     alpha=0.5, # elastic net
                                     type.measure="mae", # not sure whether this is better
                                     standardize=TRUE, # by default each feature is standardized internally
                                     family="gaussian"
                                     )
          lambda_value <- ifelse(model$lambda.min==model$lambda[1], 
                                 model$lambda[2], model$lambda.min)
          predictions <- predict(model,
                                 newx=test_mtx, 
                                 s=lambda_value)[,1]
        }
        else {
          stop(model_name, " not implemented")
        }
        
        tibble::tibble(task=task$name,
                       trainset = paste0(train_dataset, collapse="__"),
                       testset = test_dataset,
                       test_subset = test_df_with_subject$subject_id,
                       target = test_df$target, 
                       prediction = predictions,
                       testset_mean = mean(test_df$target),
                       model = model_name,
                       feature_set = feature_name)
      }) %>%
        dplyr::bind_rows()
      result_list <- rlist::list.append(result_list, cross_dataset_out)
    }
  }
}
Warning: from glmnet C++ code (error code -99); Convergence for 99th lambda
value not reached after maxit=100000 iterations; solutions for larger lambdas
returned
Warning: from glmnet C++ code (error code -93); Convergence for 93th lambda
value not reached after maxit=100000 iterations; solutions for larger lambdas
returned
Warning: from glmnet C++ code (error code -98); Convergence for 98th lambda
value not reached after maxit=100000 iterations; solutions for larger lambdas
returned
Warning: from glmnet C++ code (error code -94); Convergence for 94th lambda
value not reached after maxit=100000 iterations; solutions for larger lambdas
returned
Warning: from glmnet C++ code (error code -91); Convergence for 91th lambda
value not reached after maxit=100000 iterations; solutions for larger lambdas
returned
Warning: from glmnet C++ code (error code -97); Convergence for 97th lambda
value not reached after maxit=100000 iterations; solutions for larger lambdas
returned
Warning: from glmnet C++ code (error code -91); Convergence for 91th lambda
value not reached after maxit=100000 iterations; solutions for larger lambdas
returned
Warning: from glmnet C++ code (error code -95); Convergence for 95th lambda
value not reached after maxit=100000 iterations; solutions for larger lambdas
returned
Code
result_df <- dplyr::bind_rows(result_list)
result_df
Code
tictoc::toc()
947.181 sec elapsed
Code
result_summary <- result_df %>%
    dplyr::group_by(task, feature_set, model, trainset, testset) %>%
    dplyr::summarise(
      test_mse = get_mse(target, prediction),
      test_r2 = get_r2(target, prediction),
      test_srho = get_spearman(target, prediction),
      mse_same_cohort = get_mse(target, testset_mean),
      .groups = "drop"
    )
result_summary

Some of the Spearman correlations (test_srho) are NA. Why is that? This happens when the Boruta algorithm returns 0 important features, in which case the predictions on the test set are a constant. A constant vector has zero standard deviation, so the correlation (which divides by the standard deviation) cannot be computed.
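
A minimal illustration with toy vectors (not challenge data): correlating against a constant yields NA because the constant's standard deviation is zero.

Code
target <- c(0.2, 1.5, 0.7, 2.1)
constant_prediction <- rep(mean(target), length(target)) # e.g. when Boruta confirms 0 features
cor(target, constant_prediction, method="spearman") # NA, with a zero-sd warning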

Code
result_summary %>%
  dplyr::filter(is.na(test_srho)) %>%
  dplyr::count(task, model)
Code
model_selection_df <- result_summary %>%
  dplyr::select(task, feature_set, model, testset, test_srho) %>%
  tidyr::pivot_wider(values_from=test_srho, names_from=testset) %>%
  dplyr::mutate(srho_mean = apply(select(., matches("[0-9]+_dataset")), 1, mean, na.rm=TRUE)) %>%
  dplyr::filter(!is.na(srho_mean)) %>% # drop entries whose predictions were constant in all test sets (all-NA Spearman)
  dplyr::mutate(srho_min = apply(select(., matches("[0-9]+_dataset")), 1, min, na.rm=TRUE)) %>%
  dplyr::mutate(srho_sd = apply(select(., matches("[0-9]+_dataset")), 1, sd, na.rm=TRUE)) %>%
  dplyr::mutate(srho_mean = round(srho_mean, 3)) %>%
  dplyr::mutate(srho_min = round(srho_min, 3)) %>%
  dplyr::mutate(srho_sd = round(srho_sd, 3)) %>%
  dplyr::mutate(across(matches("[0-9]+_dataset"), ~ round(.x, 3))) %>%
  dplyr::mutate(dim_red = dplyr::case_when(
    str_detect(feature_set, "_pca") ~ "pca",
    TRUE ~ "none"
  )) %>%
  dplyr::mutate(n_modalities = (stringr::str_count(feature_set, "\\+") + 1)) %>%
  dplyr::select(task, model, srho_mean, srho_sd, srho_min, !feature_set, feature_set)
Code
readr::write_delim(model_selection_df, file=file.path(output_dir, paste0(format(Sys.Date(), "%Y-%m-%d"), "_model_selection_df.tsv")), delim="\t")

#model_selection_df <- readr::read_delim(file.path(output_dir, "model_selection_df.tsv"), delim="\t")

library(openxlsx)
# see https://ycphs.github.io/openxlsx/reference/openxlsx_options.html
#base::options()
wb <- openxlsx::createWorkbook()
openxlsx::addWorksheet(wb, "results_unfiltered")
openxlsx::setColWidths(wb, sheet="results_unfiltered", cols = 1:10, widths = 15)
openxlsx::setColWidths(wb, sheet="results_unfiltered", cols = 11:12, widths = 100)
openxlsx::writeDataTable(wb, 
                         sheet = "results_unfiltered", 
                         x = model_selection_df, 
                         startRow = 1, 
                         startCol = 1,
                         tableStyle = "TableStyleDark1")

for (task in unique(model_selection_df$task)) {
  # task <- "task_21"
  task_result_df <- model_selection_df %>%
    dplyr::filter(task == !!task) %>%
    dplyr::filter(!is.na(srho_sd)) %>%
    dplyr::arrange(desc(srho_mean)) 
  openxlsx::addWorksheet(wb, task)
  openxlsx::setColWidths(wb, sheet=task, cols = 1:10, widths = 15)
  openxlsx::setColWidths(wb, sheet=task, cols = 11:12, widths = 100)
  openxlsx::writeDataTable(wb, 
                         sheet = task,
                         x = task_result_df, 
                         startRow = 1, 
                         startCol = 1,
                         tableStyle = "TableStyleDark1")
}
openxlsx::saveWorkbook(
  wb, 
  file=file.path(output_dir, paste0(format(Sys.Date(), "%Y-%m-%d"), "_model_selection_df.xlsx")), 
  overwrite = TRUE)

8 Prediction for 2023

  1. Load the submission template
Code
submission_df <- readr::read_tsv(file.path(input_dir, "3rdChallengeSubmissionTemplate_10032024.tsv"),
                                 show_col_types = FALSE)
submission_df

Check which assays are missing most often

Code
test_assays_available <- submission_df %>%
  dplyr::select(SubjectID) %>%
  tidylog::left_join(meta_data_plus_assays_day_0, by=c("SubjectID"="subject_id"))
left_join: added 21 columns (specimen_id, actual_day_relative_to_boost, planned_day_relative_to_boost, visit, infancy_vac, ...)
           > rows only in x                           0
           > rows only in meta_data_plus_assays_d.. (98)
           > matched rows                            54
           >                                        ====
           > rows total                              54
Code
colMeans(test_assays_available[, names(experimental_predictors_day_0)]) %>% sort()
     plasma_cytokine_concentration_by_olink 
                                  0.5925926 
                        pbmc_cell_frequency 
                                  0.8888889 
                        t_cell_polarization 
                                  0.9444444 
                          t_cell_activation 
                                  0.9629630 
                       pbmc_gene_expression 
                                  0.9814815 
                            plasma_ab_titer 
                                  1.0000000 
plasma_cytokine_concentration_by_legendplex 
                                  1.0000000 
Code
rowSums(test_assays_available[, names(experimental_predictors_day_0)]) %>% sort()
 [1] 5 5 5 5 5 5 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 7 7 7 7 7 7 7 7 7 7
[39] 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7
  2. Define for each task a list with the following entries
Code
colnames(submission_df)
 [1] "SubjectID"                           "Age"                                
 [3] "BiologicalSexAtBirth"                "VaccinePrimingStatus"               
 [5] "1.1) IgG-PT-D14-titer-Rank"          "1.2) IgG-PT-D14-FC-Rank"            
 [7] "2.1) Monocytes-D1-Rank"              "2.2) Monocytes-D1-FC-Rank"          
 [9] "3.1) CCL3-D3-Rank"                   "3.2) CCL3-D3-FC-Rank"               
[11] "4.1) IFNG/IL5-Polarization-D30-Rank"
Code
prediction_specs <- list(
  "task_11" = list(
    "model" = "baseline",
    "feature_set" = "baseline",
    "table_col" = "1.1) IgG-PT-D14-titer-Rank"
  ),
  "task_12" = list(
    "model" = "lasso",
    "feature_set" = "pbmc_cell_frequency+t_cell_polarization+metadata+baseline",
    "table_col" = "1.2) IgG-PT-D14-FC-Rank"
  ),
  "task_21" = list(
    "model" = "rf",
    "feature_set" = "pbmc_cell_frequency_pca+plasma_cytokine_concentration_by_legendplex_pca+t_cell_activation_pca+metadata+baseline",
    "table_col" = "2.1) Monocytes-D1-Rank"
  ),
  "task_22" = list(
    "model" = "elnet",
    "feature_set" = "plasma_cytokine_concentration_by_legendplex+t_cell_activation+t_cell_polarization+metadata+baseline",
    "table_col" = "2.2) Monocytes-D1-FC-Rank"
  ),
  "task_31" = list(
    "model" = "lasso",
    "feature_set" = "plasma_cytokine_concentration_by_legendplex+t_cell_polarization+metadata+baseline",
    "table_col" = "3.1) CCL3-D3-Rank"
  ),
  "task_32" = list(
    "model" = "rf",
    "feature_set" = "plasma_ab_titer_pca+metadata+baseline",
    "table_col" = "3.2) CCL3-D3-FC-Rank"
  ),
  "task_41" = list(
    "model" = "lasso",
    "feature_set" = "pbmc_cell_frequency+plasma_cytokine_concentration_by_legendplex+t_cell_activation+metadata+baseline",
    "table_col" = "4.1) IFNG/IL5-Polarization-D30-Rank"
  )
)

# checks
## 0) all task column names are in submission table
purrr::map_lgl(prediction_specs, ~ .x[["table_col"]] %in% colnames(submission_df)) %>%
  all() %>%
  stopifnot()

## a) all tasks are specified
stopifnot(dplyr::setequal(names(task_meta), names(prediction_specs)))

## b) all features are present
purrr::map_lgl(prediction_specs, ~ length(str_split(.x[["feature_set"]], "\\+")[[1]]) >= 1) %>%
  all() %>%
  stopifnot()

purrr::map_lgl(prediction_specs, ~ .x[["feature_set"]] %in% c(names(feature_list), "baseline")) %>%
  all() %>%
  stopifnot()

## c) which features are never selected?
selected_features <- purrr::map(prediction_specs, ~ str_split(.x[["feature_set"]], "\\+")[[1]]) %>%
  unlist() %>%
  unique()

all_features <- purrr::map(names(feature_list), ~ str_split(.x, "\\+")[[1]]) %>%
  unlist() %>%
  unique()

all_features[!all_features %in% selected_features]
[1] "pbmc_gene_expression_pca"                  
[2] "plasma_cytokine_concentration_by_olink_pca"
[3] "t_cell_polarization_pca"                   
[4] "pbmc_gene_expression"                      
[5] "plasma_ab_titer"                           
[6] "plasma_cytokine_concentration_by_olink"    
Code
rm(selected_features, all_features)

## d) which features are most commonly used
purrr::map(prediction_specs, ~ str_split(.x[["feature_set"]], "\\+")[[1]]) %>%
  unlist() %>%
  stringr::str_remove_all(., "_pca$") %>%
  table() %>%
  sort(decreasing=TRUE)
.
                                   baseline 
                                          7 
                                   metadata 
                                          6 
plasma_cytokine_concentration_by_legendplex 
                                          4 
                        pbmc_cell_frequency 
                                          3 
                          t_cell_activation 
                                          3 
                        t_cell_polarization 
                                          3 
                            plasma_ab_titer 
                                          1 
Code
## e) all models are implemented
purrr::map_lgl(prediction_specs, ~ .x[["model"]] %in% c("rf", "rf+boruta", "elnet", "lasso", "baseline")) %>%
  all() %>%
  stopifnot()

## f) pull scores from result df
purrr::imap(prediction_specs, function(task_specs, task) {
  entry <- model_selection_df %>%
    dplyr::filter(task == !!task) %>%
    dplyr::filter(model == !!task_specs[["model"]]) %>%
    dplyr::filter(feature_set == !!task_specs[["feature_set"]])
  stopifnot(nrow(entry)==1)
  return(entry)
}) %>%
  dplyr::bind_rows()
  3. Make the predictions
  • Train the model on the full dataset

  • Make predictions for the 2023 data

Code
var_imp_list <- list()
predictions_list <- list()

for (task_name in names(prediction_specs)) {
  # task_name <- "task_12"
  task_specs <- prediction_specs[[task_name]]
  
  print(task_name)
  print(task_specs$model)
  print(task_specs$feature_set)
  
  task_df <- target_list[[task_name]]
    
  target_df <- task_df %>%
    dplyr::select(subject_id, target)
  
  baseline_df <- baseline_list[[task_name]]
  
  if (task_specs$feature_set=="baseline" && task_specs$model=="baseline") {
    feature_df <- baseline_df %>% dplyr::select(subject_id)
  } else {
    feature_df <- feature_list[[task_specs$feature_set]]
  }
  
  if (str_detect(task_specs$feature_set, "baseline")) {
    feature_df <- feature_df %>%
      dplyr::left_join(baseline_df, by="subject_id")
  }
  
  train_df <- target_df %>%
    tidylog::inner_join(feature_df, by="subject_id") %>%
    dplyr::left_join(dplyr::distinct(dplyr::select(meta_data, c(subject_id, dataset))),
                     by="subject_id") %>%
    dplyr::select(-c(dataset)) %>%
    tibble::column_to_rownames(var="subject_id")
  stopifnot(!any(is.na(train_df)))
  stopifnot(nrow(train_df)==nrow(dplyr::distinct(train_df)))
  # check that no features were duplicated during joining of tables (duplicates end with .x and .y)
  # -> disabled, since the PCA feature sets legitimately produce PC1.x / PC1.y after joining
  #stopifnot(all(purrr::map_dbl(stringr::str_split(colnames(train_df), "\\."), ~ length(.x)) == 1))
  #colnames(train_df)[purrr::map_dbl(stringr::str_split(colnames(train_df), "\\."), ~ length(.x)) > 1]
  
  test_df <- submission_df %>%
    dplyr::mutate(subject_id = SubjectID) %>% 
    dplyr::select(subject_id) %>%
    dplyr::left_join(feature_df, by="subject_id") %>%
    tibble::column_to_rownames(var="subject_id")
  
  # check if the baseline is missing for certain instances
  if (str_detect(task_specs$feature_set, "baseline")) {
    if (sum(is.na(test_df$baseline)) > 0) {
      message(paste0("Baseline values are missing for ", 
                   sum(is.na(test_df$baseline)),
                   " samples! Imputing with Median"))
      test_df$baseline[is.na(test_df$baseline)] <- median(test_df$baseline, na.rm=TRUE)
    }
  }
  
  # now there should be no more NAs!
  stopifnot(mean(is.na(test_df)) == 0)
  
  # some more checks
  stopifnot(identical(colnames(train_df)[colnames(train_df)!="target"], colnames(test_df)))
  #colnames(test_df)[!colnames(test_df) %in% colnames(train_df)]
  #colnames(train_df)[!colnames(train_df) %in% colnames(test_df)]
  
  # if we need matrix, vector objects for certain ML algos
  train_mtx <- as.matrix(dplyr::select(train_df, -target))
  train_target <- dplyr::pull(train_df, target)
  test_mtx <- as.matrix(test_df)
  
  if (task_specs$feature_set=="baseline" && task_specs$model=="baseline") {
    predictions <- test_df$baseline
    names(predictions) <- rownames(test_df)
  } else if (task_specs$model == "rf") {
    model <- ranger::ranger(formula = target ~ ., 
                            data = train_df, 
                            num.trees = 500, 
                            importance = "impurity")
    predictions <- predict(model, test_df)$predictions
    names(predictions) <- rownames(test_df)
    var_imp_list[[task_name]] <- 
      tibble(task=task_name,
             importance = model$variable.importance,
             feature = names(model$variable.importance)
      )
  } else if (task_specs$model == "rf+boruta") {
    boruta_out <- Boruta::Boruta(formula = target ~ .,
                                 data = train_df,
                                 maxRuns=1000,
                                 num.trees=500 
                                 )
    boruta_feats <- names(boruta_out$finalDecision)[boruta_out$finalDecision=="Confirmed"]
    stopifnot(length(boruta_feats) != 0)
  
    train_df_subset <- train_df %>% dplyr::select(dplyr::all_of(c(boruta_feats, "target")))
    test_df_subset <- test_df %>% dplyr::select(dplyr::all_of(boruta_feats)) # test_df has no target column here
    model <- ranger::ranger(formula = target ~ ., 
                            data = train_df_subset, 
                            num.trees = 500, 
                            importance = "impurity")
    predictions <- predict(model, test_df_subset)$predictions
    names(predictions) <- rownames(test_df)
    var_imp_list[[task_name]] <- 
      tibble(task=task_name,
             importance = model$variable.importance,
             feature = names(model$variable.importance)
      )
  } else if (task_specs$model == "lasso") {
    # manual standardization kept for reference; glmnet standardizes internally:
    # feature_means <- matrixStats::colMeans2(train_mtx)
    # feature_sds <- matrixStats::colSds(train_mtx)
    # train_mtx <- t((t(train_mtx) - feature_means) / feature_sds)
    # train_target <- (train_target - mean(train_target)) / sd(train_target)
    #test_mtx <- t((t(test_mtx) - feature_means) / feature_sds)
    
    # inner hyperparameter optim loop using LOOCV to find best lambda
    model <- glmnet::cv.glmnet(x=train_mtx,
                               y=train_target,
                               nfolds=nrow(train_mtx),
                               grouped=FALSE,
                               alpha=1, # LASSO
                               type.measure="mae", # not sure whether this is better
                               standardize=TRUE, # by default each feature is standardized internally
                               family="gaussian"
                               )
    # investigate
    #beta_mtx <- as.data.frame(as.matrix(model$glmnet.fit$beta))
    #colnames(beta_mtx) <- paste0("lambda_", round(model$lambda, 3))
    #plot(model)
    #lambda_value <- model$lambda.1se, # model$lambda.min
    # never use zero features (with lambda=model$lambda[1]), since constant predictions
    # will have NA spearman...
    lambda_value <- ifelse(model$lambda.min==model$lambda[1], 
                           model$lambda[2], model$lambda.min)
    predictions <- predict(model,
                           newx=test_mtx, 
                           s=lambda_value)[,1]
    coefs <- model$glmnet.fit$beta[, lambda_value==model$glmnet.fit$lambda]
    var_imp_list[[task_name]] <- 
      tibble(task=task_name,
             importance = abs(coefs),
             feature = names(coefs)
      )
  } else if (task_specs$model == "elnet") {
    # manual standardization kept for reference; glmnet standardizes internally:
    # feature_means <- matrixStats::colMeans2(train_mtx)
    # feature_sds <- matrixStats::colSds(train_mtx)
    # train_mtx <- t((t(train_mtx) - feature_means) / feature_sds)
    # train_target <- (train_target - mean(train_target)) / sd(train_target)
    #test_mtx <- t((t(test_mtx) - feature_means) / feature_sds)
    
    # inner hyperparameter optim loop using LOOCV to find best lambda
    model <- glmnet::cv.glmnet(x=train_mtx,
                               y=train_target,
                               nfolds=nrow(train_mtx),
                               grouped=FALSE,
                               alpha=0.5, # elastic net
                               type.measure="mae", # not sure whether this is better
                               standardize=TRUE, # by default each feature is standardized internally
                               family="gaussian"
                               )
    lambda_value <- ifelse(model$lambda.min==model$lambda[1], 
                           model$lambda[2], model$lambda.min)
    predictions <- predict(model,
                           newx=test_mtx, 
                           s=lambda_value)[,1]
    coefs <- model$glmnet.fit$beta[, lambda_value==model$glmnet.fit$lambda]
    var_imp_list[[task_name]] <- 
      tibble(task=task_name,
             importance = abs(coefs),
             feature = names(coefs)
      )
  }
  
  stopifnot(all(names(predictions) == rownames(test_df)))
  
  predictions_list[[task_name]] <- 
    tibble::tibble(subject_id = as.numeric(names(predictions)), 
                   predictions = predictions,
                   task_name = task_name,
                   model = task_specs$model,
                   feature_set = task_specs$feature_set)
}
[1] "task_11"
[1] "baseline"
[1] "baseline"
inner_join: added one column (baseline)
            > rows only in x          ( 0)
            > rows only in feature_df (55)
            > matched rows             52
            >                         ====
            > rows total               52
[1] "task_12"
[1] "lasso"
[1] "pbmc_cell_frequency+t_cell_polarization+metadata+baseline"
inner_join: added 20 columns (pbmc_cell_frequency_Basophils, pbmc_cell_frequency_Bcells, pbmc_cell_frequency_CD4Tcells, pbmc_cell_frequency_CD8Tcells, pbmc_cell_frequency_Classical_Monocytes, ...)
            > rows only in x          ( 1)
            > rows only in feature_df (71)
            > matched rows             51
            >                         ====
            > rows total               51
[1] "task_21"
[1] "rf"
[1] "pbmc_cell_frequency_pca+plasma_cytokine_concentration_by_legendplex_pca+t_cell_activation_pca+metadata+baseline"
inner_join: added 20 columns (PC1.x, PC2.x, PC3.x, PC4.x, PC5.x, ...)
            > rows only in x          (15)
            > rows only in feature_df (57)
            > matched rows             51
            >                         ====
            > rows total               51
Baseline values are missing for 6 samples! Imputing with Median
[1] "task_22"
[1] "elnet"
[1] "plasma_cytokine_concentration_by_legendplex+t_cell_activation+t_cell_polarization+metadata+baseline"
inner_join: added 28 columns (plasma_cytokine_concentration_by_legendplex_P01375, plasma_cytokine_concentration_by_legendplex_P01563, plasma_cytokine_concentration_by_legendplex_P01579, plasma_cytokine_concentration_by_legendplex_P02778, plasma_cytokine_concentration_by_legendplex_P05231, ...)
            > rows only in x          (16)
            > rows only in feature_df (57)
            > matched rows             50
            >                         ====
            > rows total               50
Baseline values are missing for 6 samples! Imputing with Median
[1] "task_31"
[1] "lasso"
[1] "plasma_cytokine_concentration_by_legendplex+t_cell_polarization+metadata+baseline"
inner_join: added 24 columns (plasma_cytokine_concentration_by_legendplex_P01375, plasma_cytokine_concentration_by_legendplex_P01563, plasma_cytokine_concentration_by_legendplex_P01579, plasma_cytokine_concentration_by_legendplex_P02778, plasma_cytokine_concentration_by_legendplex_P05231, ...)
            > rows only in x          (29)
            > rows only in feature_df (54)
            > matched rows             54
            >                         ====
            > rows total               54
Baseline values are missing for 1 samples! Imputing with Median
[1] "task_32"
[1] "rf"
[1] "plasma_ab_titer_pca+metadata+baseline"
inner_join: added 20 columns (PC1, PC2, PC3, PC4, PC5, ...)
            > rows only in x          (30)
            > rows only in feature_df (54)
            > matched rows             53
            >                         ====
            > rows total               53
Baseline values are missing for 1 samples! Imputing with Median
[1] "task_41"
[1] "lasso"
[1] "pbmc_cell_frequency+plasma_cytokine_concentration_by_legendplex+t_cell_activation+metadata+baseline"
inner_join: added 32 columns (pbmc_cell_frequency_Basophils, pbmc_cell_frequency_Bcells, pbmc_cell_frequency_CD4Tcells, pbmc_cell_frequency_CD8Tcells, pbmc_cell_frequency_Classical_Monocytes, ...)
            > rows only in x          ( 1)
            > rows only in feature_df (66)
            > matched rows             42
            >                         ====
            > rows total               42
Baseline values are missing for 3 samples! Imputing with Median
Code
names(predictions_list)
[1] "task_11" "task_12" "task_21" "task_22" "task_31" "task_32" "task_41"
Code
submission_df_with_predictions <- submission_df

for (task_name in names(predictions_list)) {
  task_preds_df <- predictions_list[[task_name]] %>%
    dplyr::select(subject_id, predictions)
  
  submission_df_with_predictions <- submission_df_with_predictions %>%
    tidylog::left_join(task_preds_df, by=c("SubjectID" = "subject_id")) %>%
    dplyr::mutate(!!prediction_specs[[task_name]][["table_col"]] := predictions) %>%
    dplyr::select(-predictions)
}
left_join: added one column (predictions)
           > rows only in x               0
           > rows only in task_preds_df ( 0)
           > matched rows                54
           >                            ====
           > rows total                  54
left_join: added one column (predictions)
           > rows only in x               0
           > rows only in task_preds_df ( 0)
           > matched rows                54
           >                            ====
           > rows total                  54
left_join: added one column (predictions)
           > rows only in x               0
           > rows only in task_preds_df ( 0)
           > matched rows                54
           >                            ====
           > rows total                  54
left_join: added one column (predictions)
           > rows only in x               0
           > rows only in task_preds_df ( 0)
           > matched rows                54
           >                            ====
           > rows total                  54
left_join: added one column (predictions)
           > rows only in x               0
           > rows only in task_preds_df ( 0)
           > matched rows                54
           >                            ====
           > rows total                  54
left_join: added one column (predictions)
           > rows only in x               0
           > rows only in task_preds_df ( 0)
           > matched rows                54
           >                            ====
           > rows total                  54
left_join: added one column (predictions)
           > rows only in x               0
           > rows only in task_preds_df ( 0)
           > matched rows                54
           >                            ====
           > rows total                  54
Code
submission_df_with_predictions

Lastly we need to create ranks from those predictions, see:

  • https://discuss.cmi-pb.org/t/ranking-results-clarification/321

  • We expect contestants to generate computational models, and upon making predictions, these values are ranked from highest to lowest (i.e. highest = 1, lowest = N) before making a final submission.
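
As a toy check of the direction (not challenge data): base R's rank() assigns rank 1 to the smallest value, so the predictions must be negated to give the highest prediction rank 1.

Code
preds <- c(0.3, 2.0, 1.1)
rank(preds, ties.method="average")  # 1 3 2 -> lowest = 1 (wrong direction)
rank(-preds, ties.method="average") # 3 1 2 -> highest = 1, as required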

Code
submission_df_with_rank <- submission_df_with_predictions

task_titles <- purrr::map_chr(prediction_specs, ~ .x[["table_col"]])
stopifnot(all(task_titles %in% colnames(submission_df_with_predictions)))

for (task_title_oi in task_titles) {
  # negate so that the highest prediction receives rank 1, per the clarification quoted above
  submission_df_with_rank[[task_title_oi]] <- 
    rank(-submission_df_with_predictions[[task_title_oi]], ties.method="average")
}
readr::write_tsv(
  x = submission_df_with_rank, 
  file = file.path(output_dir, paste0(format(Sys.Date(), "%Y-%m-%d"), "_submission_psls.tsv"))
)
submission_df_with_rank

Then we can also look at the most important features

Code
var_imp_list %>%
  dplyr::bind_rows() %>%
  dplyr::group_by(task) %>%
  dplyr::slice_max(order_by=importance, n=10, with_ties = FALSE) %>%
  dplyr::filter(importance > 0)
Code
var_imp_list %>%
  dplyr::bind_rows() %>%
  dplyr::group_by(task) %>%
  dplyr::slice_max(order_by=importance, n=10, with_ties = FALSE) %>%
  dplyr::filter(importance > 0) %>%
  dplyr::summarise(features = paste0(feature, collapse=", ")) %>%
  readr::write_delim(., file=file.path(output_dir, paste0(format(Sys.Date(), "%Y-%m-%d"), "_var_imp_df.tsv")), delim="\t")

9 Appendix

Also save the session image, just in case.

Code
save.image(file=file.path(output_dir, paste0(format(Sys.Date(), "%Y-%m-%d"), "_model_selection_session.RData")))
#load(file=file.path(output_dir, "2024-11-11_model_selection_session.RData"))