Single-Omic Models After Integration

Author

Philipp Sven Lars Schäfer

Published

November 28, 2024

1 Packages

Code
suppressPackageStartupMessages({
  library(tidyverse)
  library(flextable)
  library(ggdark)
  library(magick)
  source(file.path("..", "src", "read_data.R"))
  source(file.path("..", "src", "colors.R"))
  source(file.path("..", "src", "generate_targets.R"))
  source(file.path("..", "src", "model.R"))
})

2 Data

Code
# Root directory of all input data, relative to this document.
input_dir <- file.path("..", "data")
Code
# Feature-level annotation tables.
celltype_meta <- read_celltype_meta(input_dir)
gene_meta <- read_gene_meta_plus(input_dir)
protein_meta <- read_protein_meta(input_dir)

# Harmonized metadata and the per-day specimen lookup derived from it.
meta_data <- read_harmonized_meta_data(input_dir)
specimen_per_day <- get_specimen_per_day(meta_data = meta_data)

# Directory holding the cached intermediate datasets (written when RECOMPUTE
# is TRUE, read back when it is FALSE).
prc_dataset_dir <- file.path(input_dir, "prc_datasets")

# TRUE: rebuild filtered -> normalized -> integrated data and the prediction
# targets from the raw files and refresh the cache.
# FALSE: load the integrated data and the targets from the cache.
RECOMPUTE <- TRUE
if (RECOMPUTE) {
  source(file.path("..", "src", "normalize_integrate.R"))

  # write_rds() does not create missing directories, so make sure the cache
  # directory exists before the first write (no-op if it is already there).
  dir.create(prc_dataset_dir, showWarnings = FALSE, recursive = TRUE)

  raw_experimental_data <- read_raw_experimental_data(input_dir)

  filtered_experimental_data <- filter_experimental_data(
    meta_data = meta_data,
    experimental_data = raw_experimental_data,
    gene_meta = gene_meta
  )
  write_rds(
    filtered_experimental_data,
    file = file.path(prc_dataset_dir, "filtered_experimental_data.RDS")
  )

  normalized_experimental_data <- normalize_experimental_data(
    meta_data = meta_data,
    raw_experimental_data = filtered_experimental_data,
    gene_meta = gene_meta
  )
  write_rds(
    normalized_experimental_data,
    file = file.path(prc_dataset_dir, "normalized_experimental_data.RDS")
  )

  integrated_experimental_data <- integrate_experimental_data(
    meta_data = meta_data,
    normalized_experimental_data = normalized_experimental_data
  )
  write_rds(
    integrated_experimental_data,
    file = file.path(prc_dataset_dir, "integrated_experimental_data.RDS")
  )

  # use raw/filtered experimental data for computation of targets
  # NOTE(review): `experimental_data_settings` is not defined in this document;
  # presumably it is created by one of the sourced scripts -- confirm.
  target_list <- generate_all_targets(
    meta_data = meta_data,
    experimental_data = filtered_experimental_data,
    experimental_data_settings = experimental_data_settings,
    gene_meta = gene_meta,
    protein_meta = protein_meta
  )
  write_rds(target_list, file = file.path(prc_dataset_dir, "target_list.RDS"))

  # Only the integrated data is used as predictor input below; release the
  # intermediate stages.
  rm(raw_experimental_data, filtered_experimental_data, normalized_experimental_data)
  experimental_data <- integrated_experimental_data
} else {
  experimental_data <- read_rds(
    file = file.path(prc_dataset_dir, "integrated_experimental_data.RDS")
  )
  target_list <- read_rds(file = file.path(prc_dataset_dir, "target_list.RDS"))
}
Loading required package: S4Vectors
Loading required package: stats4
Loading required package: BiocGenerics

Attaching package: 'BiocGenerics'
The following object is masked from 'package:flextable':

    width
The following objects are masked from 'package:lubridate':

    intersect, setdiff, union
The following objects are masked from 'package:dplyr':

    combine, intersect, setdiff, union
The following objects are masked from 'package:stats':

    IQR, mad, sd, var, xtabs
The following objects are masked from 'package:base':

    Filter, Find, Map, Position, Reduce, anyDuplicated, aperm, append,
    as.data.frame, basename, cbind, colnames, dirname, do.call,
    duplicated, eval, evalq, get, grep, grepl, intersect, is.unsorted,
    lapply, mapply, match, mget, order, paste, pmax, pmax.int, pmin,
    pmin.int, rank, rbind, rownames, sapply, setdiff, sort, table,
    tapply, union, unique, unsplit, which.max, which.min

Attaching package: 'S4Vectors'
The following objects are masked from 'package:lubridate':

    second, second<-
The following objects are masked from 'package:dplyr':

    first, rename
The following object is masked from 'package:tidyr':

    expand
The following objects are masked from 'package:base':

    I, expand.grid, unname
Loading required package: IRanges

Attaching package: 'IRanges'
The following object is masked from 'package:lubridate':

    %within%
The following objects are masked from 'package:dplyr':

    collapse, desc, slice
The following object is masked from 'package:purrr':

    reduce
Loading required package: GenomicRanges
Loading required package: GenomeInfoDb
Loading required package: SummarizedExperiment
Loading required package: MatrixGenerics
Loading required package: matrixStats

Attaching package: 'matrixStats'
The following object is masked from 'package:dplyr':

    count

Attaching package: 'MatrixGenerics'
The following objects are masked from 'package:matrixStats':

    colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
    colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
    colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
    colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
    colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
    colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
    colWeightedMeans, colWeightedMedians, colWeightedSds,
    colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
    rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
    rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
    rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
    rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
    rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
    rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
    rowWeightedSds, rowWeightedVars
Loading required package: Biobase
Welcome to Bioconductor

    Vignettes contain introductory material; view with
    'browseVignettes()'. To cite Bioconductor, see
    'citation("Biobase")', and for packages 'citation("pkgname")'.

Attaching package: 'Biobase'
The following object is masked from 'package:MatrixGenerics':

    rowMedians
The following objects are masked from 'package:matrixStats':

    anyMissing, rowMedians
Loading required package: mgcv
Loading required package: nlme

Attaching package: 'nlme'
The following object is masked from 'package:IRanges':

    collapse
The following object is masked from 'package:dplyr':

    collapse
This is mgcv 1.9-1. For overview type 'help("mgcv-package")'.
Loading required package: genefilter

Attaching package: 'genefilter'
The following objects are masked from 'package:MatrixGenerics':

    rowSds, rowVars
The following objects are masked from 'package:matrixStats':

    rowSds, rowVars
The following object is masked from 'package:readr':

    spec
Loading required package: BiocParallel
pbmc_cell_frequency | Removed 550 specimens because missing in meta data
plasma_ab_titer | Removed 6931 specimens because missing in meta data
plasma_cytokine_concentration_by_olink | Removed 495 specimens because missing in meta data
pbmc_cell_frequency | Removed 56 features because not in feature subset
plasma_ab_titer | Removed 48 features because not in feature subset
plasma_cytokine_concentration_by_olink | Removed 234 features because not in feature subset
t_cell_polarization | Removed 3 features because not in feature subset
plasma_cytokine_concentration_by_olink | Removed 300 features because qc warning
plasma_ab_titer | Removed 10540 measurements because wrong unit used
plasma_cytokine_concentration_by_olink | Removed 2400 measurements because wrong unit used
plasma_cytokine_concentration_by_legendplex | Removed 8 because specimen is outlier
plasma_cytokine_concentration_by_olink | Removed specimen 750, 760, 824, 833, 894, 903 because fraction of measurements below LOQ > 50%
plasma_ab_titer | Removed specimen 674, 675, 676 because fraction of measurements below LOD > 50%
converting counts to integer mode
Found 4 batches
Using null model in ComBat-seq.
Adjusting for 0 covariate(s) or covariate level(s)
Estimating dispersions
Fitting the GLM model
Shrinkage off - using GLM estimates for parameters
Adjusting the data
Found3batches
Adjusting for0covariate(s) or covariate level(s)
Standardizing Data across genes
Fitting L/S model and finding priors
Finding parametric adjustments
Adjusting the Data
Code
# Drop the `pbmc_gene_expression_counts` entry before building the wide
# predictor matrices.
# A logical mask is used instead of `x[-which(cond)]`: when nothing matches,
# `which()` returns integer(0) and `x[-integer(0)]` selects *no* elements,
# which would silently discard every modality.
experimental_data <-
  experimental_data[!(names(experimental_data) %in% "pbmc_gene_expression_counts")]

# One wide table per remaining modality; missing values are median-imputed
# (see the per-modality log lines emitted because verbose = TRUE).
experimental_predictors <- generate_wide_experimental_data(
  experimental_data = experimental_data,
  impute = "median",
  verbose = TRUE
)
plasma_cytokine_concentration_by_olink | NA Fraction: 0.00288417166589755 | Imputed with median imputation
t_cell_activation | NA Fraction: 0.0576923076923077 | Removed samples: 681
t_cell_activation | NA Fraction: 0.0553691275167785 | Imputed with median imputation
t_cell_polarization | NA Fraction: 0.0134099616858238 | Imputed with median imputation
Code
# Restrict the gene-expression predictors to the 1000 genes with the largest
# `mean_rank` in the gene annotation table.
# NOTE(review): slice_max() keeps ties by default, so slightly more than 1000
# rows are possible -- confirm this is acceptable.
hvg <- dplyr::slice_max(gene_meta, order_by = mean_rank, n = 1000)

# Column subset by gene id; every selected id must be a column of the matrix.
experimental_predictors[["pbmc_gene_expression"]] <-
  experimental_predictors[["pbmc_gene_expression"]][
    , hvg$versioned_ensembl_gene_id_clean
  ]

3 Conclusions

  • It is unclear whether this indicates a bug, but models trained on normalized data and models trained on batch-corrected data perform almost identically, except for the Olink features (and probably also the gene expression data).

4 Questions

5 Results

Code
# Prediction tasks reported below. Each entry carries
#   name        - key used to look up the task's targets in `target_list`
#   header      - markdown heading emitted for the task
#   description - task statement printed under the heading
# The constructor is kept inside local() so it does not leak into the session.
task_meta <- local({
  new_task <- function(name, header, description) {
    list(name = name, header = header, description = description)
  }

  list(
    task_11 = new_task(
      "task_11",
      "## Task 1.1",
      "Rank the individuals by IgG antibody levels against pertussis toxin (PT) that we detect in plasma 14 days post booster vaccinations."
    ),
    task_12 = new_task(
      "task_12",
      "## Task 1.2",
      "Rank the individuals by fold change of IgG antibody levels against pertussis toxin (PT) that we detect in plasma 14 days post booster vaccinations compared to titer values at day 0."
    ),
    task_21 = new_task(
      "task_21",
      "## Task 2.1",
      "Rank the individuals by predicted frequency of Monocytes on day 1 post boost after vaccination."
    ),
    task_22 = new_task(
      "task_22",
      "## Task 2.2",
      "Rank the individuals by fold change of predicted frequency of Monocytes on day 1 post booster vaccination compared to cell frequency values at day 0."
    ),
    task_31 = new_task(
      "task_31",
      "## Task 3.1",
      "Rank the individuals by predicted gene expression of CCL3 on day 3 post-booster vaccination."
    ),
    task_32 = new_task(
      "task_32",
      "## Task 3.2",
      "Rank the individuals by fold change of predicted gene expression of CCL3 on day 3 post booster vaccination compared to gene expression values at day 0."
    ),
    task_41 = new_task(
      "task_41",
      "## Task 4.1",
      "Rank the individuals based on their Th1/Th2 (IFN-g/IL-5) polarization ratio on Day 30 post-booster vaccination."
    )
  )
})
Code
# Global switch: TRUE emits styled tables into the rendered document,
# FALSE makes make_flextable() hand back its input unchanged (useful when
# running the chunks interactively).
RENDER <- TRUE

#' Emit a data frame as a dark-themed flextable, or pass it through.
#'
#' @param x A data frame (or anything accepted by `flextable()`).
#' @param render If TRUE, style `x` and write it to the document with
#'   `flextable_to_rmd()`; if FALSE, return `x` untouched. Defaults to the
#'   global `RENDER` switch, so existing one-argument calls behave as before.
#' @return `x` when `render` is FALSE; otherwise the value of
#'   `flextable_to_rmd()`, which is called for its side effect of printing
#'   the table.
make_flextable <- function(x, render = RENDER) {
  if (!render) {
    return(x)
  }

  styled <- x %>%
    flextable() %>%
    bg(bg = "#333333", part = "all") %>%
    color(color = "white", part = "all") %>%
    set_table_properties(align = "left")

  # The previous version passed an undefined symbol `ft` as an extra argument
  # here; it only worked because the argument was never evaluated.
  flextable_to_rmd(styled)
}

meta_data_covariates <- get_metadata_covariates(meta_data)

# Emit the three performance tables for one model table: out-of-bag,
# leave-one-out cross-validation and repeated cross-cohort evaluation.
# The RNG seed is reset on every call so each model is evaluated from the
# same random state.
report_model_performance <- function(model_df, meta_data) {
  set.seed(42)

  get_oob_perf(model_df = model_df) %>%
    dplyr::mutate(mse = round(mse, 2), r2 = round(r2, 2), srho = round(srho, 2)) %>%
    make_flextable()

  get_loocv_perf(model_df = model_df) %>%
    dplyr::mutate(mse = round(mse, 2), r2 = round(r2, 2), srho = round(srho, 2)) %>%
    make_flextable()

  # Alternative cross-cohort evaluations that are currently disabled:
  # get_cross_cohort_perf_combinations(model_df=model_df, meta_data=meta_data) %>%
  #   dplyr::mutate(mse = round(mse, 2), r2 = round(r2, 2)) %>%
  #   make_flextable(.)
  # get_cross_cohort_perf_single(model_df=model_df, meta_data=meta_data) %>%
  #   dplyr::mutate(mse = round(mse, 2), r2 = round(r2, 2),
  #                 srho = round(srho, 2), srho_baseline = round(srho_baseline, 2),
  #                 mse_tmean = round(mse_tmean, 2)) %>%
  #   make_flextable(.)

  get_cross_cohort_perf_single_repeated(model_df = model_df, meta_data = meta_data) %>%
    dplyr::mutate(srho_mean = round(srho_mean, 2), srho_sd = round(srho_sd, 2),
                  srho_baseline = round(srho_baseline, 2)) %>%
    make_flextable()

  invisible(NULL)
}

# Subject-level predictor table per modality, restricted to day-0 specimens.
# These tables do not depend on the task, so they are built once here instead
# of once per task inside the loop.
day0_specimens <- specimen_per_day$day_0 %>%
  dplyr::select(subject_id, specimen_id)

modality_tables <- lapply(experimental_predictors, function(wide_data) {
  wide_data %>%
    as.data.frame() %>%
    tibble::rownames_to_column("specimen_id") %>%
    dplyr::mutate(specimen_id = as.numeric(specimen_id)) %>%
    dplyr::left_join(day0_specimens, by = "specimen_id") %>%
    # specimens without a day-0 match get no subject_id and are dropped
    dplyr::filter(!is.na(subject_id)) %>%
    dplyr::select(-specimen_id)
})

for (task in task_meta) {
  #task <- task_meta[[3]]

  cat(task$header)
  cat("\n\n")
  cat(task$description)
  cat("\n\n")

  # Baseline model using only the metadata covariates as predictors.
  cat(paste0("\n\n### ", "metadata model", "\n\n"))
  model_df <- target_list[[task$name]] %>%
    dplyr::left_join(meta_data_covariates, by = "subject_id")
  report_model_performance(model_df = model_df, meta_data = meta_data)

  cat("\n\n")

  # One model per assay, using that assay's day-0 measurements as predictors.
  for (modality in names(modality_tables)) {
    #modality <- names(modality_tables)[3]

    cat(paste0("\n\n### ", modality, "\n\n"))

    model_df <- target_list[[task$name]] %>%
      dplyr::left_join(modality_tables[[modality]], by = "subject_id")

    # Rows that are more than 10% NA after the join are reported as removed.
    # NOTE(review): drop_na() below removes every row containing any NA, so
    # the reported count can be lower than the number of rows actually
    # dropped -- confirm this is intended.
    n_sparse_rows <- sum(rowMeans(is.na(model_df)) > 0.1)
    if (n_sparse_rows > 0) {
      cat(paste0("\n\nRemoving ", n_sparse_rows,
                 " targets because training data is missing\n\n"))
    }

    model_df <- tidyr::drop_na(model_df)

    report_model_performance(model_df = model_df, meta_data = meta_data)

    cat("\n\n")
  }
}

5.1 Task 1.1

Rank the individuals by IgG antibody levels against pertussis toxin (PT) that we detect in plasma 14 days post booster vaccinations.

5.1.1 metadata model

mode

mse

r2

srho

oob

21,930,271

0.81

0.68

mode

mse

r2

srho

loocv

20,912,097

0.82

0.74

trainset

testset

srho_mean

srho_sd

srho_baseline

train_n

test_n

2021

2022

-0.31

0.01

0.44

32

20

2022

2021

-0.34

0.04

0.60

20

32

5.1.2 pbmc_cell_frequency

Removing 3 targets because training data is missing

mode

mse

r2

srho

oob

28,569,686

0.76

0.73

mode

mse

r2

srho

loocv

29,632,541

0.75

0.71

trainset

testset

srho_mean

srho_sd

srho_baseline

train_n

test_n

2021

2022

-0.33

0.04

0.44

29

20

2022

2021

-0.42

0.05

0.59

20

29

5.1.3 pbmc_gene_expression

mode

mse

r2

srho

oob

88,883,136

0.25

0.46

mode

mse

r2

srho

loocv

84,975,227

0.28

0.56

trainset

testset

srho_mean

srho_sd

srho_baseline

train_n

test_n

2021

2022

-0.11

0.10

0.44

32

20

2022

2021

-0.09

0.05

0.60

20

32

5.1.4 plasma_ab_titer

mode

mse

r2

srho

oob

30,973,495

0.74

0.62

mode

mse

r2

srho

loocv

29,795,572

0.75

0.64

trainset

testset

srho_mean

srho_sd

srho_baseline

train_n

test_n

2021

2022

0.09

0.05

0.44

32

20

2022

2021

-0.08

0.05

0.60

20

32

5.1.5 plasma_cytokine_concentration_by_legendplex

Removing 3 targets because training data is missing

mode

mse

r2

srho

oob

16,199,037

0.86

0.76

mode

mse

r2

srho

loocv

16,019,496

0.86

0.74

trainset

testset

srho_mean

srho_sd

srho_baseline

train_n

test_n

2021

2022

0.30

0.06

0.48

31

18

2022

2021

0.03

0.03

0.61

18

31

5.1.7 t_cell_activation

Removing 1 targets because training data is missing

mode

mse

r2

srho

oob

20,527,029

0.83

0.7

mode

mse

r2

srho

loocv

19,009,398

0.84

0.7

trainset

testset

srho_mean

srho_sd

srho_baseline

train_n

test_n

2021

2022

-0.22

0.05

0.44

31

20

2022

2021

-0.13

0.03

0.57

20

31

5.1.8 t_cell_polarization

Removing 12 targets because training data is missing

mode

mse

r2

srho

oob

26,095,972

0.76

0.66

mode

mse

r2

srho

loocv

26,164,472

0.76

0.64

trainset

testset

srho_mean

srho_sd

srho_baseline

train_n

test_n

2021

2022

-0.48

0.07

0.34

25

15

2022

2021

-0.24

0.03

0.62

15

25

5.2 Task 1.2

Rank the individuals by fold change of IgG antibody levels against pertussis toxin (PT) that we detect in plasma 14 days post booster vaccinations compared to titer values at day 0.

5.2.1 metadata model

mode

mse

r2

srho

oob

0.38

0.61

0.81

mode

mse

r2

srho

loocv

0.39

0.6

0.81

trainset

testset

srho_mean

srho_sd

srho_baseline

train_n

test_n

2021

2022

0.36

0.03

-0.89

32

20

2022

2021

0.04

0.02

-0.71

20

32

5.2.2 pbmc_cell_frequency

Removing 3 targets because training data is missing

mode

mse

r2

srho

oob

0.49

0.5

0.74

mode

mse

r2

srho

loocv

0.49

0.5

0.76

trainset

testset

srho_mean

srho_sd

srho_baseline

train_n

test_n

2021

2022

0.14

0.07

-0.89

29

20

2022

2021

0.16

0.04

-0.74

20

29

5.2.3 pbmc_gene_expression

mode

mse

r2

srho

oob

0.93

0.04

0.2

mode

mse

r2

srho

loocv

0.94

0.03

0.22

trainset

testset

srho_mean

srho_sd

srho_baseline

train_n

test_n

2021

2022

0.28

0.05

-0.89

32

20

2022

2021

0.23

0.04

-0.71

20

32

5.2.4 plasma_ab_titer

mode

mse

r2

srho

oob

0.47

0.51

0.78

mode

mse

r2

srho

loocv

0.48

0.5

0.78

trainset

testset

srho_mean

srho_sd

srho_baseline

train_n

test_n

2021

2022

0.33

0.05

-0.89

32

20

2022

2021

0.32

0.06

-0.71

20

32

5.2.5 plasma_cytokine_concentration_by_legendplex

Removing 3 targets because training data is missing

mode

mse

r2

srho

oob

0.55

0.41

0.64

mode

mse

r2

srho

loocv

0.55

0.41

0.65

trainset

testset

srho_mean

srho_sd

srho_baseline

train_n

test_n

2021

2022

-0.01

0.06

-0.89

31

18

2022

2021

-0.20

0.02

-0.69

18

31

5.2.7 t_cell_activation

Removing 1 targets because training data is missing

mode

mse

r2

srho

oob

0.47

0.52

0.74

mode

mse

r2

srho

loocv

0.5

0.49

0.71

trainset

testset

srho_mean

srho_sd

srho_baseline

train_n

test_n

2021

2022

0.03

0.05

-0.89

31

20

2022

2021

-0.01

0.06

-0.72

20

31

5.2.8 t_cell_polarization

Removing 12 targets because training data is missing

mode

mse

r2

srho

oob

0.52

0.47

0.71

mode

mse

r2

srho

loocv

0.51

0.49

0.74

trainset

testset

srho_mean

srho_sd

srho_baseline

train_n

test_n

2021

2022

0.20

0.06

-0.91

25

15

2022

2021

0.29

0.03

-0.70

15

25

5.3 Task 2.1

Rank the individuals by predicted frequency of Monocytes on day 1 post boost after vaccination.

5.3.1 metadata model

mode

mse

r2

srho

oob

64.45

0.34

0.59

mode

mse

r2

srho

loocv

63.73

0.34

0.59

trainset

testset

srho_mean

srho_sd

srho_baseline

train_n

test_n

2020

2021

0.53

0.04

0.88

12

33

2020

2022

0.13

0.06

0.55

12

21

2021

2020

0.72

0.04

0.81

33

12

2021

2022

0.49

0.02

0.55

33

21

2022

2020

0.20

0.04

0.81

21

12

2022

2021

0.60

0.01

0.88

21

33

5.3.2 pbmc_cell_frequency

mode

mse

r2

srho

oob

50.79

0.48

0.7

mode

mse

r2

srho

loocv

50.61

0.48

0.7

trainset

testset

srho_mean

srho_sd

srho_baseline

train_n

test_n

2020

2021

0.81

0.02

0.88

12

33

2020

2022

0.54

0.02

0.55

12

21

2021

2020

0.72

0.02

0.81

33

12

2021

2022

0.58

0.02

0.55

33

21

2022

2020

0.61

0.04

0.81

21

12

2022

2021

0.82

0.01

0.88

21

33

5.3.3 pbmc_gene_expression

Removing 1 targets because training data is missing

mode

mse

r2

srho

oob

84.34

0.14

0.32

mode

mse

r2

srho

loocv

84.4

0.14

0.35

trainset

testset

srho_mean

srho_sd

srho_baseline

train_n

test_n

2020

2021

0.34

0.03

0.88

11

33

2020

2022

0.11

0.05

0.55

11

21

2021

2020

0.67

0.08

0.84

33

11

2021

2022

0.58

0.05

0.55

33

21

2022

2020

0.41

0.03

0.84

21

11

2022

2021

0.49

0.03

0.88

21

33

5.3.4 plasma_ab_titer

Removing 16 targets because training data is missing

mode

mse

r2

srho

oob

75.6

0.23

0.5

mode

mse

r2

srho

loocv

76.96

0.21

0.5

trainset

testset

srho_mean

srho_sd

srho_baseline

train_n

test_n

2021

2022

0.33

0.04

0.55

29

21

2022

2021

0.37

0.05

0.88

21

29

5.3.5 plasma_cytokine_concentration_by_legendplex

Removing 20 targets because training data is missing

mode

mse

r2

srho

oob

63.89

0.34

0.56

mode

mse

r2

srho

loocv

66.66

0.31

0.53

trainset

testset

srho_mean

srho_sd

srho_baseline

train_n

test_n

2021

2022

0.25

0.06

0.53

28

18

2022

2021

0.81

0.02

0.87

18

28

5.3.7 t_cell_activation

Removing 16 targets because training data is missing

mode

mse

r2

srho

oob

58.13

0.4

0.6

mode

mse

r2

srho

loocv

59.5

0.39

0.62

trainset

testset

srho_mean

srho_sd

srho_baseline

train_n

test_n

2021

2022

0.48

0.02

0.60

30

20

2022

2021

0.75

0.01

0.88

20

30

5.3.8 t_cell_polarization

Removing 25 targets because training data is missing

mode

mse

r2

srho

oob

68.76

0.31

0.6

mode

mse

r2

srho

loocv

69.47

0.31

0.58

trainset

testset

srho_mean

srho_sd

srho_baseline

train_n

test_n

2021

2022

0.33

0.03

0.54

25

16

2022

2021

0.72

0.03

0.86

16

25

5.4 Task 2.2

Rank the individuals by fold change of predicted frequency of Monocytes on day 1 post booster vaccination compared to cell frequency values at day 0.

5.4.1 metadata model

mode

mse

r2

srho

oob

0.11

-0.04

-0.04

mode

mse

r2

srho

loocv

0.11

-0.03

0

trainset

testset

srho_mean

srho_sd

srho_baseline

train_n

test_n

2020

2021

-0.11

0.03

-0.22

12

33

2020

2022

-0.31

0.02

-0.21

12

21

2021

2020

0.52

0.03

-0.43

33

12

2021

2022

-0.31

0.04

-0.21

33

21

2022

2020

-0.44

0.04

-0.43

21

12

2022

2021

0.00

0.03

-0.22

21

33

5.4.2 pbmc_cell_frequency

mode

mse

r2

srho

oob

0.11

-0.01

0.1

mode

mse

r2

srho

loocv

0.11

-0.01

0.12

trainset

testset

srho_mean

srho_sd

srho_baseline

train_n

test_n

2020

2021

0.39

0.03

-0.22

12

33

2020

2022

0.13

0.04

-0.21

12

21

2021

2020

0.16

0.06

-0.43

33

12

2021

2022

0.15

0.04

-0.21

33

21

2022

2020

0.18

0.04

-0.43

21

12

2022

2021

0.09

0.02

-0.22

21

33

5.4.3 pbmc_gene_expression

Removing 1 targets because training data is missing

mode

mse

r2

srho

oob

0.12

-0.09

-0.08

mode

mse

r2

srho

loocv

0.12

-0.11

-0.14

trainset

testset

srho_mean

srho_sd

srho_baseline

train_n

test_n

2020

2021

0.05

0.05

-0.22

11

33

2020

2022

-0.36

0.07

-0.21

11

21

2021

2020

0.03

0.11

-0.31

33

11

2021

2022

0.33

0.06

-0.21

33

21

2022

2020

-0.68

0.06

-0.31

21

11

2022

2021

0.09

0.06

-0.22

21

33

5.4.4 plasma_ab_titer

Removing 16 targets because training data is missing

mode

mse

r2

srho

oob

0.08

-0.03

0.19

mode

mse

r2

srho

loocv

0.08

-0.01

0.14

trainset

testset

srho_mean

srho_sd

srho_baseline

train_n

test_n

2021

2022

0.24

0.05

-0.21

29

21

2022

2021

0.11

0.05

-0.25

21

29

5.4.5 plasma_cytokine_concentration_by_legendplex

Removing 20 targets because training data is missing

mode

mse

r2

srho

oob

0.06

0.14

0.41

mode

mse

r2

srho

loocv

0.06

0.16

0.43

trainset

testset

srho_mean

srho_sd

srho_baseline

train_n

test_n

2021

2022

0.05

0.07

-0.27

28

18

2022

2021

0.04

0.05

-0.25

18

28

5.4.7 t_cell_activation

Removing 16 targets because training data is missing

mode

mse

r2

srho

oob

0.08

-0.11

0.09

mode

mse

r2

srho

loocv

0.08

-0.1

0.09

trainset

testset

srho_mean

srho_sd

srho_baseline

train_n

test_n

2021

2022

0.2

0.01

-0.16

30

20

2022

2021

0.0

0.04

-0.23

20

30

5.4.8 t_cell_polarization

Removing 25 targets because training data is missing

mode

mse

r2

srho

oob

0.08

-0.12

0.06

mode

mse

r2

srho

loocv

0.08

-0.12

0.07

trainset

testset

srho_mean

srho_sd

srho_baseline

train_n

test_n

2021

2022

0.00

0.07

-0.29

25

16

2022

2021

0.13

0.03

-0.24

16

25

5.5 Task 3.1

Rank the individuals by predicted gene expression of CCL3 on day 3 post-booster vaccination.

5.5.1 metadata model

mode

mse

r2

srho

oob

1

0.18

0.46

mode

mse

r2

srho

loocv

1

0.18

0.46

trainset

testset

srho_mean

srho_sd

srho_baseline

train_n

test_n

2020

2021

0.35

0.02

0.57

26

36

2020

2022

-0.06

0.04

0.53

26

21

2021

2020

0.01

0.03

0.27

36

26

2021

2022

0.38

0.05

0.53

36

21

2022

2020

-0.01

0.02

0.27

21

26

2022

2021

0.40

0.02

0.57

21

36

5.5.2 pbmc_cell_frequency

Removing 18 targets because training data is missing

mode

mse

r2

srho

oob

1

0.15

0.33

mode

mse

r2

srho

loocv

1.03

0.13

0.32

trainset

testset

srho_mean

srho_sd

srho_baseline

train_n

test_n

2020

2021

0.40

0.03

0.59

11

33

2020

2022

-0.13

0.07

0.53

11

21

2021

2020

0.05

0.08

0.22

33

11

2021

2022

0.12

0.03

0.53

33

21

2022

2020

0.04

0.04

0.22

21

11

2022

2021

-0.03

0.05

0.59

21

33

5.5.3 pbmc_gene_expression

mode

mse

r2

srho

oob

0.96

0.21

0.5

mode

mse

r2

srho

loocv

0.94

0.23

0.51

trainset

testset

srho_mean

srho_sd

srho_baseline

train_n

test_n

2020

2021

0.03

0.04

0.57

26

36

2020

2022

0.31

0.05

0.53

26

21

2021

2020

-0.37

0.04

0.27

36

26

2021

2022

0.51

0.06

0.53

36

21

2022

2020

0.03

0.06

0.27

21

26

2022

2021

0.32

0.03

0.57

21

36

5.5.4 plasma_ab_titer

Removing 30 targets because training data is missing

mode

mse

r2

srho

oob

0.95

0.12

0.33

mode

mse

r2

srho

loocv

0.94

0.13

0.36

trainset

testset

srho_mean

srho_sd

srho_baseline

train_n

test_n

2021

2022

0.10

0.05

0.53

32

21

2022

2021

0.26

0.05

0.60

21

32

5.5.5 plasma_cytokine_concentration_by_legendplex

Removing 34 targets because training data is missing

mode

mse

r2

srho

oob

1.05

0.05

0.23

mode

mse

r2

srho

loocv

1.06

0.04

0.24

trainset

testset

srho_mean

srho_sd

srho_baseline

train_n

test_n

2021

2022

0.22

0.08

0.53

31

18

2022

2021

0.30

0.04

0.60

18

31

5.5.7 t_cell_activation

Removing 30 targets because training data is missing

mode

mse

r2

srho

oob

1.02

0.18

0.46

mode

mse

r2

srho

loocv

1.05

0.15

0.42

trainset

testset

srho_mean

srho_sd

srho_baseline

train_n

test_n

2021

2022

0.54

0.02

0.53

33

20

2022

2021

0.20

0.03

0.62

20

33

5.5.8 t_cell_polarization

Removing 40 targets because training data is missing

mode

mse

r2

srho

oob

1.04

0.17

0.39

mode

mse

r2

srho

loocv

1.03

0.17

0.37

trainset

testset

srho_mean

srho_sd

srho_baseline

train_n

test_n

2021

2022

0.48

0.03

0.46

27

16

2022

2021

0.16

0.04

0.72

16

27

5.6 Task 3.2

Rank the individuals by fold change of predicted gene expression of CCL3 on day 3 post booster vaccination compared to gene expression values at day 0.

5.6.1 metadata model

mode

mse

r2

srho

oob

1.14

0.01

0.1

mode

mse

r2

srho

loocv

1.15

0

0.08

trainset

testset

srho_mean

srho_sd

srho_baseline

train_n

test_n

2020

2021

0.21

0.01

-0.43

26

36

2020

2022

-0.26

0.03

-0.19

26

21

2021

2020

-0.16

0.02

-0.33

36

26

2021

2022

-0.08

0.05

-0.19

36

21

2022

2020

-0.35

0.02

-0.33

21

26

2022

2021

0.16

0.05

-0.43

21

36

5.6.2 pbmc_cell_frequency

Removing 18 targets because training data is missing

mode

mse

r2

srho

oob

0.96

0

0

mode

mse

r2

srho

loocv

0.97

-0.01

-0.05

trainset

testset

srho_mean

srho_sd

srho_baseline

train_n

test_n

2020

2021

0.26

0.03

-0.37

11

33

2020

2022

-0.13

0.04

-0.19

11

21

2021

2020

-0.18

0.10

-0.43

33

11

2021

2022

-0.17

0.04

-0.19

33

21

2022

2020

-0.27

0.03

-0.43

21

11

2022

2021

0.04

0.04

-0.37

21

33

5.6.3 pbmc_gene_expression

mode

mse

r2

srho

oob

1.2

-0.04

0.04

mode

mse

r2

srho

loocv

1.17

-0.02

0.11

trainset

testset

srho_mean

srho_sd

srho_baseline

train_n

test_n

2020

2021

0.11

0.04

-0.43

26

36

2020

2022

-0.28

0.04

-0.19

26

21

2021

2020

0.18

0.05

-0.33

36

26

2021

2022

-0.34

0.05

-0.19

36

21

2022

2020

-0.45

0.05

-0.33

21

26

2022

2021

-0.13

0.05

-0.43

21

36

5.6.4 plasma_ab_titer

Removing 30 targets because training data is missing

mode

mse

r2

srho

oob

1.26

0.09

0.43

mode

mse

r2

srho

loocv

1.22

0.12

0.42

trainset

testset

srho_mean

srho_sd

srho_baseline

train_n

test_n

2021

2022

0.42

0.04

-0.19

32

21

2022

2021

0.49

0.02

-0.52

21

32

5.6.5 plasma_cytokine_concentration_by_legendplex

Removing 34 targets because training data is missing

mode

mse

r2

srho

oob

1.37

0.04

-0.01

mode

mse

r2

srho

loocv

1.36

0.05

0.01

trainset

testset

srho_mean

srho_sd

srho_baseline

train_n

test_n

2021

2022

-0.21

0.04

-0.18

31

18

2022

2021

0.32

0.04

-0.54

18

31

5.6.7 t_cell_activation

Removing 30 targets because training data is missing

mode

mse

r2

srho

oob

1.25

0.08

0.33

mode

mse

r2

srho

loocv

1.3

0.04

0.33

trainset

testset

srho_mean

srho_sd

srho_baseline

train_n

test_n

2021

2022

0.20

0.04

-0.24

33

20

2022

2021

0.39

0.01

-0.45

20

33

5.6.8 t_cell_polarization

Removing 40 targets because training data is missing

mode

mse

r2

srho

oob

1.51

-0.15

0.04

mode

mse

r2

srho

loocv

1.49

-0.13

0.04

trainset

testset

srho_mean

srho_sd

srho_baseline

train_n

test_n

2021

2022

-0.24

0.05

-0.28

27

16

2022

2021

0.09

0.04

-0.37

16

27

5.7 Task 4.1

Rank the individuals based on their Th1/Th2 (IFN-g/IL-5) polarization ratio on Day 30 post-booster vaccination.

5.7.1 metadata model

mode

mse

r2

srho

oob

4.52

0.03

0.21

mode

mse

r2

srho

loocv

4.53

0.03

0.18

trainset

testset

srho_mean

srho_sd

srho_baseline

train_n

test_n

2021

2022

0.47

0.05

0.74

27

16

2022

2021

0.53

0.02

0.55

16

27

5.7.2 pbmc_cell_frequency

Removing 2 targets because training data is missing

mode

mse

r2

srho

oob

4.34

0.08

0.39

mode

mse

r2

srho

loocv

4.25

0.1

0.42

trainset

testset

srho_mean

srho_sd

srho_baseline

train_n

test_n

2021

2022

0.48

0.03

0.74

25

16

2022

2021

0.67

0.01

0.62

16

25

5.7.3 pbmc_gene_expression

mode

mse

r2

srho

oob

4.92

-0.05

0.16

mode

mse

r2

srho

loocv

4.91

-0.05

0.14

trainset

testset

srho_mean

srho_sd

srho_baseline

train_n

test_n

2021

2022

0.24

0.07

0.74

27

16

2022

2021

0.39

0.03

0.55

16

27

5.7.4 plasma_ab_titer

Removing 2 targets because training data is missing

mode

mse

r2

srho

oob

5.22

-0.07

-0.09

mode

mse

r2

srho

loocv

5.3

-0.09

-0.14

trainset

testset

srho_mean

srho_sd

srho_baseline

train_n

test_n

2021

2022

0.44

0.04

0.74

25

16

2022

2021

0.19

0.03

0.56

16

25

5.7.5 plasma_cytokine_concentration_by_legendplex

Removing 5 targets because training data is missing

mode

mse

r2

srho

oob

5.59

-0.1

-0.03

mode

mse

r2

srho

loocv

5.57

-0.09

-0.02

trainset

testset

srho_mean

srho_sd

srho_baseline

train_n

test_n

2021

2022

-0.06

0.05

0.79

24

14

2022

2021

0.53

0.02

0.52

14

24

5.7.7 t_cell_activation

Removing 2 targets because training data is missing

mode

mse

r2

srho

oob

4.45

0.07

0.33

mode

mse

r2

srho

loocv

4.49

0.07

0.28

trainset

testset

srho_mean

srho_sd

srho_baseline

train_n

test_n

2021

2022

0.42

0.05

0.83

26

15

2022

2021

0.55

0.02

0.59

15

26

5.7.8 t_cell_polarization

mode

mse

r2

srho

oob

4.25

0.09

0.26

mode

mse

r2

srho

loocv

4.16

0.11

0.3

trainset

testset

srho_mean

srho_sd

srho_baseline

train_n

test_n

2021

2022

0.08

0.03

0.74

27

16

2022

2021

0.29

0.02

0.55

16

27