Literature Models

Author

Philipp Sven Lars Schäfer

Published

November 28, 2024

1 Packages

Code

suppressPackageStartupMessages({
  library(tidyverse)
  library(DESeq2)
  library(flextable)
  library(ggdark)
  library(magick)
})

knitr::opts_knit$set(output.dir = "./")

source(file.path("..", "src", "read_data.R"))
source(file.path("..", "src", "colors.R"))
source(file.path("..", "src", "generate_targets.R"))
source(file.path("..", "src", "model.R"))

2 Data

Code

input_dir = file.path("..", "data")

Code

meta_data <- read_harmonized_meta_data(input_dir)
gene_meta <- read_gene_meta(input_dir)

experimental_data <- read_raw_experimental_data(input_dir)
experimental_data <- filter_experimental_data(meta_data, experimental_data, gene_meta)

pbmc_cell_frequency | Removed 550 specimens because missing in meta data

plasma_ab_titer | Removed 6931 specimens because missing in meta data

plasma_cytokine_concentration_by_olink | Removed 495 specimens because missing in meta data

pbmc_cell_frequency | Removed 56 features because not in feature subset

plasma_ab_titer | Removed 48 features because not in feature subset

plasma_cytokine_concentration_by_olink | Removed 234 features because not in feature subset

t_cell_polarization | Removed 3 features because not in feature subset

plasma_cytokine_concentration_by_olink | Removed 300 features because qc warning

plasma_ab_titer | Removed 10540 measurements because wrong unit used

plasma_cytokine_concentration_by_olink | Removed 2400 measurements because wrong unit used

plasma_cytokine_concentration_by_legendplex | Removed 8 because specimen is outlier

plasma_cytokine_concentration_by_olink | Removed specimen 750, 760, 824, 833, 894, 903 because fraction of measurements below LOQ > 50%

plasma_ab_titer | Removed specimen 674, 675, 676 because fraction of measurements below LOD > 50%

Code

meta_data <- filter_meta_data(meta_data, experimental_data)
wide_experimental_data <- generate_wide_experimental_data(experimental_data=experimental_data,
                                                 impute="zero")

plasma_cytokine_concentration_by_olink | NA Fraction: 0.00288417166589755 | Imputed with zero imputation

t_cell_activation | NA Fraction: 0.0576923076923077 | Removed samples: 681

t_cell_activation | NA Fraction: 0.0553691275167785 | Imputed with zero imputation

t_cell_polarization | NA Fraction: 0.0134099616858238 | Imputed with zero imputation

Code

celltype_meta <- read_celltype_meta(input_dir)
gene_meta <- read_gene_meta(input_dir)
protein_meta <- read_protein_meta(input_dir)

Code

target_list <- generate_all_targets(
  meta_data=meta_data,
  experimental_data=experimental_data,
  experimental_data_settings=experimental_data_settings,
  gene_meta=gene_meta,
  protein_meta=protein_meta)
str(target_list)

List of 7
 $ task_11: tibble [52 x 3] (S3: tbl_df/tbl/data.frame)
  ..$ subject_id: num [1:52] 61 62 63 64 65 66 67 68 69 70 ...
  ..$ baseline  : num [1:52] 112.8 125 278.7 353.2 83.8 ...
  ..$ target    : num [1:52] 304 1548 1917 558 1838 ...
 $ task_12: tibble [52 x 3] (S3: tbl_df/tbl/data.frame)
  ..$ subject_id: num [1:52] 61 62 63 64 65 66 67 68 69 70 ...
  ..$ baseline  : num [1:52] 112.8 125 278.7 353.2 83.8 ...
  ..$ target    : num [1:52] 0.992 2.517 1.928 0.458 3.088 ...
 $ task_21: tibble [66 x 3] (S3: tbl_df/tbl/data.frame)
  ..$ subject_id: num [1:66] 4 6 15 20 21 26 29 31 33 36 ...
  ..$ baseline  : num [1:66] 5.49 22.79 15.64 21.4 15.23 ...
  ..$ target    : num [1:66] 7.21 41.38 10.59 26.61 34.81 ...
 $ task_22: tibble [66 x 3] (S3: tbl_df/tbl/data.frame)
  ..$ subject_id: num [1:66] 4 6 15 20 21 26 29 31 33 36 ...
  ..$ baseline  : num [1:66] 5.49 22.79 15.64 21.4 15.23 ...
  ..$ target    : num [1:66] 0.273 0.596 -0.39 0.218 0.827 ...
 $ task_31: tibble [83 x 3] (S3: tbl_df/tbl/data.frame)
  ..$ subject_id: num [1:83] 1 3 4 5 6 9 10 13 15 20 ...
  ..$ baseline  : num [1:83] 5.68 5.4 5.04 5.66 5.44 ...
  ..$ target    : num [1:83] 5.91 5.46 4.89 5.23 5.38 ...
 $ task_32: tibble [83 x 3] (S3: tbl_df/tbl/data.frame)
  ..$ subject_id: num [1:83] 1 3 4 5 6 9 10 13 15 20 ...
  ..$ baseline  : num [1:83] 5.68 5.4 5.04 5.66 5.44 ...
  ..$ target    : num [1:83] 0.234 0.0612 -0.1466 -0.4284 -0.0628 ...
 $ task_41: tibble [43 x 3] (S3: tbl_df/tbl/data.frame)
  ..$ subject_id: num [1:43] 61 62 66 67 68 69 70 71 72 73 ...
  ..$ baseline  : num [1:43] -2.933 -1.985 2.286 -0.555 -0.144 ...
  ..$ target    : num [1:43] 2.625 -1.114 2.817 -2.363 -0.466 ...

Code

symbol_to_ensemble <- gene_meta %>%
  dplyr::pull(versioned_ensembl_gene_id_clean, gene_symbol)

gene_set <- read_rds(file.path(input_dir, "external_data",
                               "literature_models_first_challenge-cmipb-challenge", 
                               "CMI-PB-literature_models_first_challenge-c3c23cb",
                               "Study-1-Avey-2017",
                               "gene_set.RDS")) %>%
  purrr::map(function(genes) {
    symbol_to_ensemble[genes]
  })

length(gene_set)

[1] 346

Code

specimen_per_day <- get_specimen_per_day(meta_data)
specimen_per_day$day_0 <- specimen_per_day$day_0 %>%
  dplyr::filter(specimen_id %in% rownames(wide_experimental_data$pbmc_gene_expression)) %>%
  dplyr::mutate(specimen_id = as.character(specimen_id))

pbmc_gex <- wide_experimental_data$pbmc_gene_expression[specimen_per_day$day_0$specimen_id, ]
str(pbmc_gex)

 num [1:136, 1:56436] 27 37 23 13 17 24 9 18 24 19 ...
 - attr(*, "dimnames")=List of 2
  ..$ : chr [1:136] "1" "19" "27" "37" ...
  ..$ : chr [1:56436] "ENSG00000000003_14" "ENSG00000000005_5" "ENSG00000000419_12" "ENSG00000000457_13" ...

Code

# filter genes and normalize the gene expression data
min_counts <- 200
min_samples <- 20

genes_to_keep <- colnames(pbmc_gex)[
  (colSums(pbmc_gex) >= min_counts) &
    (colSums(pbmc_gex > 0) >= min_samples)
]

pbmc_gex <- pbmc_gex[ , genes_to_keep]

str(pbmc_gex)

 num [1:136, 1:23470] 27 37 23 13 17 24 9 18 24 19 ...
 - attr(*, "dimnames")=List of 2
  ..$ : chr [1:136] "1" "19" "27" "37" ...
  ..$ : chr [1:23470] "ENSG00000000003_14" "ENSG00000000419_12" "ENSG00000000457_13" "ENSG00000000460_16" ...

Code

# check that all column (gene) names are unique
stopifnot(length(colnames(pbmc_gex)) == length(unique(colnames(pbmc_gex))))

str(pbmc_gex)

 num [1:136, 1:23470] 27 37 23 13 17 24 9 18 24 19 ...
 - attr(*, "dimnames")=List of 2
  ..$ : chr [1:136] "1" "19" "27" "37" ...
  ..$ : chr [1:23470] "ENSG00000000003_14" "ENSG00000000419_12" "ENSG00000000457_13" "ENSG00000000460_16" ...

Code

# variance stabilizing transformation, log transform or vst from DESEQ2?
# pbmc_gex <- log1p(pbmc_gex)

# Create DESeq dataset object
dds <- DESeq2::DESeqDataSetFromMatrix(countData = t(pbmc_gex),
                                      colData = tibble::tibble(subject_id=rownames(pbmc_gex)),
                                      design = ~ 1)

converting counts to integer mode

Code

# DESeq2 normalization with variance stabilizing transformation (vst)
dds <- DESeq2::estimateSizeFactors(dds) ## estimate size factor
dds <- DESeq2::estimateDispersions(dds)

gene-wise dispersion estimates

mean-dispersion relationship

final dispersion estimates

Code

dds <- DESeq2::varianceStabilizingTransformation(dds, blind=TRUE)
pbmc_gex <- t(assay(dds))
str(pbmc_gex)

 num [1:136, 1:23470] 4.36 4.51 3.9 3.58 3.82 ...
 - attr(*, "dimnames")=List of 2
  ..$ : chr [1:136] "1" "19" "27" "37" ...
  ..$ : chr [1:23470] "ENSG00000000003_14" "ENSG00000000419_12" "ENSG00000000457_13" "ENSG00000000460_16" ...

Code

score_signature_ulm <- function(gex_mtx, signature_genes) {
  n_missing <- sum(!signature_genes %in% colnames(gex_mtx))
  message(paste0(n_missing, " features from signature are missing"))
  signature_genes <- signature_genes[signature_genes %in% colnames(gex_mtx)]
  
  binary_vec <- vector(mode="numeric", length=ncol(gex_mtx))
  names(binary_vec) <- colnames(gex_mtx)
  binary_vec[signature_genes] <- 1
  stopifnot(sum(binary_vec) == length(signature_genes))
  
  purrr::map(rownames(gex_mtx), function(rname) {
    model <- lm(
      formula = y ~ x,
      data = tibble(y = gex_mtx[rname, ],
                    x = binary_vec)
    )
    tval <- summary(model)$coefficients["x", "t value"]
    return(
      tibble(specimen_id = rname, 
             stat = tval)
    )
  }) %>%
    dplyr::bind_rows()
}

score_signature_mean <- function(gex_mtx, signature_genes) {
  n_missing <- sum(!signature_genes %in% colnames(gex_mtx))
  message(paste0(n_missing, " features from signature are missing"))
  signature_genes <- signature_genes[signature_genes %in% colnames(gex_mtx)]
  
  return(
    tibble(
      specimen_id = rownames(gex_mtx),
      stat = rowSums(gex_mtx[, signature_genes])
      )
  )
}

signatures_oi <- c(
  "inflammatory response (M33)",
  "platelet activation (III) (M42)",
  "BCR signaling (M54)",
  "Random_1",
  "Random_2",
  "Random_3"
)

#random_signatures <- sample(names(gene_set), 3)
set.seed(42)
gene_set[["Random_1"]] <- sample(colnames(pbmc_gex), 20)
gene_set[["Random_2"]] <- sample(colnames(pbmc_gex), 20)
gene_set[["Random_3"]] <- sample(colnames(pbmc_gex), 20)

sign_scores <- purrr::map(signatures_oi, function(signature_name) {
  # signature_name <- signatures_oi[1]
  # score_signature_mean(gex_mtx=pbmc_gex, 
  #                     signature=gene_set[[signature_name]]) %>%
  score_signature_ulm(gex_mtx=pbmc_gex,
                      signature=gene_set[[signature_name]]) %>%
    dplyr::mutate(signature = signature_name)
}) %>%
  dplyr::bind_rows() %>%
  tidyr::pivot_wider(names_from="signature", values_from="stat") %>%
  dplyr::mutate(specimen_id = as.numeric(specimen_id)) %>%
  dplyr::left_join(meta_data %>% dplyr::select(subject_id, specimen_id), 
                   by="specimen_id") %>%
  dplyr::select(-specimen_id)

2 features from signature are missing

0 features from signature are missing

1 features from signature are missing

0 features from signature are missing
0 features from signature are missing
0 features from signature are missing

Code

GGally::ggpairs(sign_scores %>% dplyr::select(-c(subject_id))) +
  ggdark::dark_mode(verbose=FALSE)

Registered S3 method overwritten by 'GGally':
  method from   
  +.gg   ggplot2

Code

sign_scores

3 Concusions

I cannot quite reproduce the results from the paper. In the paper the signatures from Avey et al. (2017), are very helpful to predict the fold changes, but this is not quite the case here. Maybe I am not using the best scoring function.

4 Results

Code

task_meta <- list(
  task_11 = list(
    name = "task_11",
    header = "## Task 1.1",
    description = "Rank the individuals by IgG antibody levels against pertussis toxin (PT) that we detect in plasma 14 days post booster vaccinations."
  ),
  task_12 = list(
    name = "task_12",
    header = "## Task 1.2",
    description = "Rank the individuals by fold change of IgG antibody levels against pertussis toxin (PT) that we detect in plasma 14 days post booster vaccinations compared to titer values at day 0."
  )
)

Code

RENDER <- TRUE

make_flextable <- function(x) {
  if (RENDER) {
    x %>%
      flextable() %>% 
      bg(., bg = "#333333", part = "all") %>%
      color(., color = "white", part = "all") %>%
      set_table_properties(., align = "left") %>%
      flextable_to_rmd(ft) %>%
      return()
  } else {
    return(x)
  }
}

for (task in task_meta) {
  # task <- task_meta[[2]]

  cat(task$header)
  cat("\n\n")
  cat(task$description)
  cat("\n\n")
  
  for (signature in signatures_oi) {
    # signature <- signatures_oi[[1]]
    
    cat(paste0("### ", signature))
    cat("\n\n")
    
    model_df <- target_list[[task$name]] %>%
      dplyr::left_join(sign_scores %>% dplyr::select(dplyr::all_of(c("subject_id", signature))), 
                       by="subject_id")
    model_df %>%
      dplyr::left_join((meta_data %>% dplyr::select(subject_id, dataset) %>% dplyr::distinct()),
                       by="subject_id") %>%
      dplyr::group_by(dataset) %>%
      dplyr::summarise(
        "srho_baseline" = round(get_spearman(.data$target, .data$baseline), 2),
        "srho_signature" = round(get_spearman(.data$target, .data[[signature]]), 2),
        "srho_pval_signature" = get_spearman_pval(.data$target, .data[[signature]])
      ) %>%
      make_flextable()
  }
}

4.1 Task 1.1

Rank the individuals by IgG antibody levels against pertussis toxin (PT) that we detect in plasma 14 days post booster vaccinations.

4.1.1 inflammatory response (M33)

dataset	srho_baseline	srho_signature	srho_pval_signature
2021_dataset	0.60	0.06	0.388
2022_dataset	0.44	0.42	0.034

4.1.2 platelet activation (III) (M42)

dataset	srho_baseline	srho_signature	srho_pval_signature
2021_dataset	0.60	0.02	0.441
2022_dataset	0.44	-0.03	0.541

4.1.3 BCR signaling (M54)

dataset	srho_baseline	srho_signature	srho_pval_signature
2021_dataset	0.60	-0.14	0.785
2022_dataset	0.44	-0.08	0.656

4.1.4 Random_1

dataset	srho_baseline	srho_signature	srho_pval_signature
2021_dataset	0.60	0.00	0.513
2022_dataset	0.44	-0.11	0.694

4.1.5 Random_2

dataset	srho_baseline	srho_signature	srho_pval_signature
2021_dataset	0.60	-0.10	0.729
2022_dataset	0.44	-0.36	0.937

4.1.6 Random_3

dataset	srho_baseline	srho_signature	srho_pval_signature
2021_dataset	0.60	0.07	0.318
2022_dataset	0.44	-0.12	0.709

4.2 Task 1.2

Rank the individuals by fold change of IgG antibody levels against pertussis toxin (PT) that we detect in plasma 14 days post booster vaccinations compared to titer values at day 0.

4.2.1 inflammatory response (M33)

dataset	srho_baseline	srho_signature	srho_pval_signature
2021_dataset	-0.71	0.11	0.266
2022_dataset	-0.89	-0.01	0.529

4.2.2 platelet activation (III) (M42)

dataset	srho_baseline	srho_signature	srho_pval_signature
2021_dataset	-0.71	0.03	0.436
2022_dataset	-0.89	0.27	0.143

4.2.3 BCR signaling (M54)

dataset	srho_baseline	srho_signature	srho_pval_signature
2021_dataset	-0.71	-0.19	0.848
2022_dataset	-0.89	0.01	0.472

4.2.4 Random_1

dataset	srho_baseline	srho_signature	srho_pval_signature
2021_dataset	-0.71	0.03	0.453
2022_dataset	-0.89	0.04	0.436

4.2.5 Random_2

dataset	srho_baseline	srho_signature	srho_pval_signature
2021_dataset	-0.71	0.09	0.309
2022_dataset	-0.89	-0.11	0.677

4.2.6 Random_3

dataset	srho_baseline	srho_signature	srho_pval_signature
2021_dataset	-0.71	-0.04	0.570
2022_dataset	-0.89	0.18	0.211

5 Previous Results

From: Shinde, P. et al. Putting computational models of immunity to the test - an invited challenge to predict B. pertussis vaccination outcomes. (2024) doi:10.1101/2024.09.04.611290.

Code

knitr::include_graphics(file.path("..", "img", "results_challenge_2.png"))