Author

Philipp Sven Lars Schäfer

Published

November 28, 2024

1 Packages

Code
suppressPackageStartupMessages({
  library(tidyverse)
  library(DESeq2)
  library(flextable)
  library(ggdark)
  library(magick)
})

knitr::opts_knit$set(output.dir = "./")

source(file.path("..", "src", "read_data.R"))
source(file.path("..", "src", "colors.R"))
source(file.path("..", "src", "generate_targets.R"))
source(file.path("..", "src", "model.R"))

2 Data

Code
input_dir = file.path("..", "data")
Code
meta_data <- read_harmonized_meta_data(input_dir)
gene_meta <- read_gene_meta(input_dir)

experimental_data <- read_raw_experimental_data(input_dir)
experimental_data <- filter_experimental_data(meta_data, experimental_data, gene_meta)
pbmc_cell_frequency | Removed 550 specimens because missing in meta data
plasma_ab_titer | Removed 6931 specimens because missing in meta data
plasma_cytokine_concentration_by_olink | Removed 495 specimens because missing in meta data
pbmc_cell_frequency | Removed 56 features because not in feature subset
plasma_ab_titer | Removed 48 features because not in feature subset
plasma_cytokine_concentration_by_olink | Removed 234 features because not in feature subset
t_cell_polarization | Removed 3 features because not in feature subset
plasma_cytokine_concentration_by_olink | Removed 300 features because qc warning
plasma_ab_titer | Removed 10540 measurements because wrong unit used
plasma_cytokine_concentration_by_olink | Removed 2400 measurements because wrong unit used
plasma_cytokine_concentration_by_legendplex | Removed 8 because specimen is outlier
plasma_cytokine_concentration_by_olink | Removed specimen 750, 760, 824, 833, 894, 903 because fraction of measurements below LOQ > 50%
plasma_ab_titer | Removed specimen 674, 675, 676 because fraction of measurements below LOD > 50%
Code
meta_data <- filter_meta_data(meta_data, experimental_data)
wide_experimental_data <- generate_wide_experimental_data(experimental_data=experimental_data,
                                                 impute="zero")
plasma_cytokine_concentration_by_olink | NA Fraction: 0.00288417166589755 | Imputed with zero imputation
t_cell_activation | NA Fraction: 0.0576923076923077 | Removed samples: 681
t_cell_activation | NA Fraction: 0.0553691275167785 | Imputed with zero imputation
t_cell_polarization | NA Fraction: 0.0134099616858238 | Imputed with zero imputation
Code
celltype_meta <- read_celltype_meta(input_dir)
gene_meta <- read_gene_meta(input_dir)
protein_meta <- read_protein_meta(input_dir)
Code
target_list <- generate_all_targets(
  meta_data=meta_data,
  experimental_data=experimental_data,
  experimental_data_settings=experimental_data_settings,
  gene_meta=gene_meta,
  protein_meta=protein_meta)
str(target_list)
List of 7
 $ task_11: tibble [52 x 3] (S3: tbl_df/tbl/data.frame)
  ..$ subject_id: num [1:52] 61 62 63 64 65 66 67 68 69 70 ...
  ..$ baseline  : num [1:52] 112.8 125 278.7 353.2 83.8 ...
  ..$ target    : num [1:52] 304 1548 1917 558 1838 ...
 $ task_12: tibble [52 x 3] (S3: tbl_df/tbl/data.frame)
  ..$ subject_id: num [1:52] 61 62 63 64 65 66 67 68 69 70 ...
  ..$ baseline  : num [1:52] 112.8 125 278.7 353.2 83.8 ...
  ..$ target    : num [1:52] 0.992 2.517 1.928 0.458 3.088 ...
 $ task_21: tibble [66 x 3] (S3: tbl_df/tbl/data.frame)
  ..$ subject_id: num [1:66] 4 6 15 20 21 26 29 31 33 36 ...
  ..$ baseline  : num [1:66] 5.49 22.79 15.64 21.4 15.23 ...
  ..$ target    : num [1:66] 7.21 41.38 10.59 26.61 34.81 ...
 $ task_22: tibble [66 x 3] (S3: tbl_df/tbl/data.frame)
  ..$ subject_id: num [1:66] 4 6 15 20 21 26 29 31 33 36 ...
  ..$ baseline  : num [1:66] 5.49 22.79 15.64 21.4 15.23 ...
  ..$ target    : num [1:66] 0.273 0.596 -0.39 0.218 0.827 ...
 $ task_31: tibble [83 x 3] (S3: tbl_df/tbl/data.frame)
  ..$ subject_id: num [1:83] 1 3 4 5 6 9 10 13 15 20 ...
  ..$ baseline  : num [1:83] 5.68 5.4 5.04 5.66 5.44 ...
  ..$ target    : num [1:83] 5.91 5.46 4.89 5.23 5.38 ...
 $ task_32: tibble [83 x 3] (S3: tbl_df/tbl/data.frame)
  ..$ subject_id: num [1:83] 1 3 4 5 6 9 10 13 15 20 ...
  ..$ baseline  : num [1:83] 5.68 5.4 5.04 5.66 5.44 ...
  ..$ target    : num [1:83] 0.234 0.0612 -0.1466 -0.4284 -0.0628 ...
 $ task_41: tibble [43 x 3] (S3: tbl_df/tbl/data.frame)
  ..$ subject_id: num [1:43] 61 62 66 67 68 69 70 71 72 73 ...
  ..$ baseline  : num [1:43] -2.933 -1.985 2.286 -0.555 -0.144 ...
  ..$ target    : num [1:43] 2.625 -1.114 2.817 -2.363 -0.466 ...
Code
symbol_to_ensemble <- gene_meta %>%
  dplyr::pull(versioned_ensembl_gene_id_clean, gene_symbol)

gene_set <- read_rds(file.path(input_dir, "external_data",
                               "literature_models_first_challenge-cmipb-challenge", 
                               "CMI-PB-literature_models_first_challenge-c3c23cb",
                               "Study-1-Avey-2017",
                               "gene_set.RDS")) %>%
  purrr::map(function(genes) {
    symbol_to_ensemble[genes]
  })

length(gene_set)
[1] 346
Code
specimen_per_day <- get_specimen_per_day(meta_data)
specimen_per_day$day_0 <- specimen_per_day$day_0 %>%
  dplyr::filter(specimen_id %in% rownames(wide_experimental_data$pbmc_gene_expression)) %>%
  dplyr::mutate(specimen_id = as.character(specimen_id))

pbmc_gex <- wide_experimental_data$pbmc_gene_expression[specimen_per_day$day_0$specimen_id, ]
str(pbmc_gex)
 num [1:136, 1:56436] 27 37 23 13 17 24 9 18 24 19 ...
 - attr(*, "dimnames")=List of 2
  ..$ : chr [1:136] "1" "19" "27" "37" ...
  ..$ : chr [1:56436] "ENSG00000000003_14" "ENSG00000000005_5" "ENSG00000000419_12" "ENSG00000000457_13" ...
Code
# filter genes and normalize the gene expression data
min_counts <- 200
min_samples <- 20

genes_to_keep <- colnames(pbmc_gex)[
  (colSums(pbmc_gex) >= min_counts) &
    (colSums(pbmc_gex > 0) >= min_samples)
]

pbmc_gex <- pbmc_gex[ , genes_to_keep]

str(pbmc_gex)
 num [1:136, 1:23470] 27 37 23 13 17 24 9 18 24 19 ...
 - attr(*, "dimnames")=List of 2
  ..$ : chr [1:136] "1" "19" "27" "37" ...
  ..$ : chr [1:23470] "ENSG00000000003_14" "ENSG00000000419_12" "ENSG00000000457_13" "ENSG00000000460_16" ...
Code
# check that all column (gene) names are unique
stopifnot(length(colnames(pbmc_gex)) == length(unique(colnames(pbmc_gex))))

str(pbmc_gex)
 num [1:136, 1:23470] 27 37 23 13 17 24 9 18 24 19 ...
 - attr(*, "dimnames")=List of 2
  ..$ : chr [1:136] "1" "19" "27" "37" ...
  ..$ : chr [1:23470] "ENSG00000000003_14" "ENSG00000000419_12" "ENSG00000000457_13" "ENSG00000000460_16" ...
Code
# variance stabilizing transformation, log transform or vst from DESEQ2?
# pbmc_gex <- log1p(pbmc_gex)

# Create DESeq dataset object
dds <- DESeq2::DESeqDataSetFromMatrix(countData = t(pbmc_gex),
                                      colData = tibble::tibble(subject_id=rownames(pbmc_gex)),
                                      design = ~ 1)
converting counts to integer mode
Code
# DESeq2 normalization with variance stabilizing transformation (vst)
dds <- DESeq2::estimateSizeFactors(dds) ## estimate size factor
dds <- DESeq2::estimateDispersions(dds)
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
Code
dds <- DESeq2::varianceStabilizingTransformation(dds, blind=TRUE)
pbmc_gex <- t(assay(dds))
str(pbmc_gex)
 num [1:136, 1:23470] 4.36 4.51 3.9 3.58 3.82 ...
 - attr(*, "dimnames")=List of 2
  ..$ : chr [1:136] "1" "19" "27" "37" ...
  ..$ : chr [1:23470] "ENSG00000000003_14" "ENSG00000000419_12" "ENSG00000000457_13" "ENSG00000000460_16" ...
Code
score_signature_ulm <- function(gex_mtx, signature_genes) {
  n_missing <- sum(!signature_genes %in% colnames(gex_mtx))
  message(paste0(n_missing, " features from signature are missing"))
  signature_genes <- signature_genes[signature_genes %in% colnames(gex_mtx)]
  
  binary_vec <- vector(mode="numeric", length=ncol(gex_mtx))
  names(binary_vec) <- colnames(gex_mtx)
  binary_vec[signature_genes] <- 1
  stopifnot(sum(binary_vec) == length(signature_genes))
  
  purrr::map(rownames(gex_mtx), function(rname) {
    model <- lm(
      formula = y ~ x,
      data = tibble(y = gex_mtx[rname, ],
                    x = binary_vec)
    )
    tval <- summary(model)$coefficients["x", "t value"]
    return(
      tibble(specimen_id = rname, 
             stat = tval)
    )
  }) %>%
    dplyr::bind_rows()
}

score_signature_mean <- function(gex_mtx, signature_genes) {
  n_missing <- sum(!signature_genes %in% colnames(gex_mtx))
  message(paste0(n_missing, " features from signature are missing"))
  signature_genes <- signature_genes[signature_genes %in% colnames(gex_mtx)]
  
  return(
    tibble(
      specimen_id = rownames(gex_mtx),
      stat = rowSums(gex_mtx[, signature_genes])
      )
  )
}

signatures_oi <- c(
  "inflammatory response (M33)",
  "platelet activation (III) (M42)",
  "BCR signaling (M54)",
  "Random_1",
  "Random_2",
  "Random_3"
)

#random_signatures <- sample(names(gene_set), 3)
set.seed(42)
gene_set[["Random_1"]] <- sample(colnames(pbmc_gex), 20)
gene_set[["Random_2"]] <- sample(colnames(pbmc_gex), 20)
gene_set[["Random_3"]] <- sample(colnames(pbmc_gex), 20)

sign_scores <- purrr::map(signatures_oi, function(signature_name) {
  # signature_name <- signatures_oi[1]
  # score_signature_mean(gex_mtx=pbmc_gex, 
  #                     signature=gene_set[[signature_name]]) %>%
  score_signature_ulm(gex_mtx=pbmc_gex,
                      signature=gene_set[[signature_name]]) %>%
    dplyr::mutate(signature = signature_name)
}) %>%
  dplyr::bind_rows() %>%
  tidyr::pivot_wider(names_from="signature", values_from="stat") %>%
  dplyr::mutate(specimen_id = as.numeric(specimen_id)) %>%
  dplyr::left_join(meta_data %>% dplyr::select(subject_id, specimen_id), 
                   by="specimen_id") %>%
  dplyr::select(-specimen_id)
2 features from signature are missing
0 features from signature are missing
1 features from signature are missing
0 features from signature are missing
0 features from signature are missing
0 features from signature are missing
Code
GGally::ggpairs(sign_scores %>% dplyr::select(-c(subject_id))) +
  ggdark::dark_mode(verbose=FALSE)
Registered S3 method overwritten by 'GGally':
  method from   
  +.gg   ggplot2

Code
sign_scores

3 Concusions

  • I cannot quite reproduce the results from the paper. In the paper the signatures from Avey et al. (2017), are very helpful to predict the fold changes, but this is not quite the case here. Maybe I am not using the best scoring function.

4 Results

Code
task_meta <- list(
  task_11 = list(
    name = "task_11",
    header = "## Task 1.1",
    description = "Rank the individuals by IgG antibody levels against pertussis toxin (PT) that we detect in plasma 14 days post booster vaccinations."
  ),
  task_12 = list(
    name = "task_12",
    header = "## Task 1.2",
    description = "Rank the individuals by fold change of IgG antibody levels against pertussis toxin (PT) that we detect in plasma 14 days post booster vaccinations compared to titer values at day 0."
  )
)
Code
RENDER <- TRUE

make_flextable <- function(x) {
  if (RENDER) {
    x %>%
      flextable() %>% 
      bg(., bg = "#333333", part = "all") %>%
      color(., color = "white", part = "all") %>%
      set_table_properties(., align = "left") %>%
      flextable_to_rmd(ft) %>%
      return()
  } else {
    return(x)
  }
}

for (task in task_meta) {
  # task <- task_meta[[2]]

  cat(task$header)
  cat("\n\n")
  cat(task$description)
  cat("\n\n")
  
  for (signature in signatures_oi) {
    # signature <- signatures_oi[[1]]
    
    cat(paste0("### ", signature))
    cat("\n\n")
    
    model_df <- target_list[[task$name]] %>%
      dplyr::left_join(sign_scores %>% dplyr::select(dplyr::all_of(c("subject_id", signature))), 
                       by="subject_id")
    model_df %>%
      dplyr::left_join((meta_data %>% dplyr::select(subject_id, dataset) %>% dplyr::distinct()),
                       by="subject_id") %>%
      dplyr::group_by(dataset) %>%
      dplyr::summarise(
        "srho_baseline" = round(get_spearman(.data$target, .data$baseline), 2),
        "srho_signature" = round(get_spearman(.data$target, .data[[signature]]), 2),
        "srho_pval_signature" = get_spearman_pval(.data$target, .data[[signature]])
      ) %>%
      make_flextable()
  }
}

4.1 Task 1.1

Rank the individuals by IgG antibody levels against pertussis toxin (PT) that we detect in plasma 14 days post booster vaccinations.

4.1.1 inflammatory response (M33)

dataset

srho_baseline

srho_signature

srho_pval_signature

2021_dataset

0.60

0.06

0.388

2022_dataset

0.44

0.42

0.034

4.1.2 platelet activation (III) (M42)

dataset

srho_baseline

srho_signature

srho_pval_signature

2021_dataset

0.60

0.02

0.441

2022_dataset

0.44

-0.03

0.541

4.1.3 BCR signaling (M54)

dataset

srho_baseline

srho_signature

srho_pval_signature

2021_dataset

0.60

-0.14

0.785

2022_dataset

0.44

-0.08

0.656

4.1.4 Random_1

dataset

srho_baseline

srho_signature

srho_pval_signature

2021_dataset

0.60

0.00

0.513

2022_dataset

0.44

-0.11

0.694

4.1.5 Random_2

dataset

srho_baseline

srho_signature

srho_pval_signature

2021_dataset

0.60

-0.10

0.729

2022_dataset

0.44

-0.36

0.937

4.1.6 Random_3

dataset

srho_baseline

srho_signature

srho_pval_signature

2021_dataset

0.60

0.07

0.318

2022_dataset

0.44

-0.12

0.709

4.2 Task 1.2

Rank the individuals by fold change of IgG antibody levels against pertussis toxin (PT) that we detect in plasma 14 days post booster vaccinations compared to titer values at day 0.

4.2.1 inflammatory response (M33)

dataset

srho_baseline

srho_signature

srho_pval_signature

2021_dataset

-0.71

0.11

0.266

2022_dataset

-0.89

-0.01

0.529

4.2.2 platelet activation (III) (M42)

dataset

srho_baseline

srho_signature

srho_pval_signature

2021_dataset

-0.71

0.03

0.436

2022_dataset

-0.89

0.27

0.143

4.2.3 BCR signaling (M54)

dataset

srho_baseline

srho_signature

srho_pval_signature

2021_dataset

-0.71

-0.19

0.848

2022_dataset

-0.89

0.01

0.472

4.2.4 Random_1

dataset

srho_baseline

srho_signature

srho_pval_signature

2021_dataset

-0.71

0.03

0.453

2022_dataset

-0.89

0.04

0.436

4.2.5 Random_2

dataset

srho_baseline

srho_signature

srho_pval_signature

2021_dataset

-0.71

0.09

0.309

2022_dataset

-0.89

-0.11

0.677

4.2.6 Random_3

dataset

srho_baseline

srho_signature

srho_pval_signature

2021_dataset

-0.71

-0.04

0.570

2022_dataset

-0.89

0.18

0.211

5 Previous Results

  • From: Shinde, P. et al. Putting computational models of immunity to the test - an invited challenge to predict B. pertussis vaccination outcomes. (2024) doi:10.1101/2024.09.04.611290.
Code
knitr::include_graphics(file.path("..", "img", "results_challenge_2.png"))