Author

Philipp Sven Lars Schäfer

Published

November 28, 2024

1 Packages

Code
suppressPackageStartupMessages({
  library(tidyverse)
  library(ggdark)
  library(vsn)
  library(lme4)
  library(DESeq2)
  library(factoextra)
  library(FactoMineR)
  library(magick) # formatting
  source(file.path("..", "src", "read_data.R"))
  source(file.path("..", "src", "generate_targets.R"))
  source(file.path("..", "src", "model.R"))
  source(file.path("..", "src", "batch_effects.R"))
})
knitr::knit_hooks$set(crop = knitr::hook_pdfcrop) # formatting

2 Data

Code
input_dir = file.path("..", "data")
Code
meta_data <- read_harmonized_meta_data(input_dir)
gene_meta <- read_gene_meta(input_dir)
protein_meta <- read_protein_meta(input_dir)
celltype_meta <- read_celltype_meta(input_dir)

experimental_data <- read_raw_experimental_data(input_dir)
experimental_data <- filter_experimental_data(meta_data=meta_data, 
                                              experimental_data=experimental_data,
                                              gene_meta=gene_meta)
pbmc_cell_frequency | Removed 550 specimens because missing in meta data
plasma_ab_titer | Removed 6931 specimens because missing in meta data
plasma_cytokine_concentration_by_olink | Removed 495 specimens because missing in meta data
pbmc_cell_frequency | Removed 56 features because not in feature subset
plasma_ab_titer | Removed 48 features because not in feature subset
plasma_cytokine_concentration_by_olink | Removed 234 features because not in feature subset
t_cell_polarization | Removed 3 features because not in feature subset
plasma_cytokine_concentration_by_olink | Removed 300 features because qc warning
plasma_ab_titer | Removed 10540 measurements because wrong unit used
plasma_cytokine_concentration_by_olink | Removed 2400 measurements because wrong unit used
plasma_cytokine_concentration_by_legendplex | Removed 8 because specimen is outlier
plasma_cytokine_concentration_by_olink | Removed specimen 750, 760, 824, 833, 894, 903 because fraction of measurements below LOQ > 50%
plasma_ab_titer | Removed specimen 674, 675, 676 because fraction of measurements below LOD > 50%
Code
specimen_per_day <- get_specimen_per_day(meta_data=meta_data)

3 Conclusions

3.1 PBMC Fractions

  • For the baseline specimen (day 0), we see a very strong batch effect, because the 2023 dataset is very different

3.2 PBMC Gene Expression

  • Very strong cohort-specific effects for baseline specimen and post-booster specimen

3.3 Plasma Antibody Levels

3.4 Plasma Cytokine Concentration by Legendplex

  • After removing outlier in the legendplex data…

3.6 T Cell Activation

  • Specimen are only available for baseline, no cohort-specific effect

3.7 T Cell Polarization

  • Specimen are only available for baseline, most of the variance is explained by interaction terms age_at_boost:dataset and infancy_vac:dataset. TODO: Think about what to make of this.

4 Results

Code
##| crop: true # interfers with dynamic toc generation

for (day in names(specimen_per_day)) {
  # day <- names(specimen_per_day)[1]
  
  cat("\n\n")
  cat(paste0("## ", day))
  cat("\n\n")
  
  specimen_df <- specimen_per_day[[day]]
  
  for (assay in names(experimental_data)) {
    # assay <- names(experimental_data)[4]
    
    cat("\n\n")
    cat(paste0("### ", assay))
    cat("\n\n")
    
    feature_col <- experimental_data_settings[[assay]]$feature_col
    value_col <- experimental_data_settings[[assay]]$value_col
    
    assay_df_long <- experimental_data[[assay]]
    #str(assay_df_long)
    
    specimen_df_filtered <- specimen_df %>%
      dplyr::filter(specimen_id %in% unique(assay_df_long$specimen_id))
    
    if (nrow(specimen_df_filtered) <= 3) {
      cat(paste0("Fewer than 3 specimen for this combination of assay and day available"))
    } else {
      assay_mtx_wide <- assay_df_long %>%
        dplyr::select(dplyr::all_of(c("specimen_id", feature_col, value_col))) %>%
        tidyr::pivot_wider(names_from=dplyr::all_of(feature_col), 
                           values_from=dplyr::all_of(value_col)) %>%
        tibble::column_to_rownames(var="specimen_id") %>%
        as.matrix() %>%
        .[as.character(specimen_df_filtered$specimen_id), ]
      
      impute_mode <- "knn"
      stopifnot(impute_mode %in% c("zero", "median_feature", "knn"))
      
      if (any(is.na(assay_mtx_wide))) {
        cat(paste0("Non-zero fraction of NAs: ", 
                   round(mean(is.na(assay_mtx_wide)), 4)))
        # data: matrix with genes in the rows, samples in the columns
        if (impute_mode == "zero") {
          assay_mtx_wide[is.na(assay_mtx_wide)] <- 0
        } else if (impute_mode == "knn") {
          assay_mtx_wide <- t(impute::impute.knn(data=t(assay_mtx_wide))$data)
        } else if (impute_mode == "median_feature") {
          col_medians <- matrixStats::colMedians(assay_mtx_wide, na.rm=TRUE)
          for (colname in colnames(assay_mtx_wide)) {
            col_vec <- assay_mtx_wide[, colname]
            col_vec[is.na(col_vec)] <- col_medians[colname]
            assay_mtx_wide[, colname] <- col_vec
          }
        }
        stopifnot(!any(is.na(assay_mtx_wide)))
      }
      
      if (assay == "pbmc_gene_expression") {
        # normalization using vst from DEseq2
        #str(assay_mtx_wide)
        
        # Create DESeq dataset object
        min_counts <- 200
        min_samples <- 20
      
        genes_to_keep <- colnames(assay_mtx_wide)[
          (colSums(assay_mtx_wide) >= min_counts) &
            (colSums(assay_mtx_wide > 0) >= min_samples)
        ]
        assay_mtx_wide <- assay_mtx_wide[ , genes_to_keep]
        #str(assay_mtx_wide)
        
        # check that all column (gene) names are unique
        stopifnot(length(colnames(assay_mtx_wide)) == length(unique(colnames(assay_mtx_wide))))
        
        assay_mtx_wide <- 
          DESeq2::DESeqDataSetFromMatrix(countData = t(assay_mtx_wide),
                                         colData = tibble::tibble(specimen_id=rownames(assay_mtx_wide)),
                                         design = ~ 1)
        
        # DESeq2 normalization with variance stabilizing transformation (vst)
        assay_mtx_wide <- DESeq2::estimateSizeFactors(assay_mtx_wide, quiet=TRUE) ## estimate size factor
        assay_mtx_wide <- DESeq2::estimateDispersions(assay_mtx_wide, quiet=TRUE)
        assay_mtx_wide <- DESeq2::varianceStabilizingTransformation(assay_mtx_wide, blind=TRUE)
        assay_mtx_wide <- t(assay(assay_mtx_wide))
      }
      
      # dist_mtx <- as.matrix(dist(assay_mtx_wide))
      # dist_mtx_median <- median(dist_mtx[lower.tri(dist_mtx)])
      # dist_mtx_binary <- dist_mtx > (3 * dist_mtx_median)
      # outliers <-
      #  which(rowSums(dist_mtx_binary) > (10*median(rowSums(dist_mtx_binary))))
      # no_outliers <- rownames(dist_mtx)[!rownames(dist_mtx) %in% names(outliers)]
      # assay_mtx_wide <- assay_mtx_wide[no_outliers, ]
      
      #str(assay_mtx_wide)
      suppressMessages({
        suppressWarnings({
          p1 <- pvca_analysis(assay_mtx_wide=assay_mtx_wide,
                              meta_data=meta_data)$p1
        })
      })
      print(p1)
      
      plist <- generate_pca_plots(assay_mtx_wide=assay_mtx_wide,
                                  meta_data=meta_data)
      print(plist$p1)
      print(plist$p2)
      print(plist$p3)
    }
    cat(paste0("\n\n"))
  }
}

4.1 day_0

4.1.1 pbmc_cell_frequency

4.1.2 pbmc_gene_expression

converting counts to integer mode

4.1.3 plasma_ab_titer

4.1.4 plasma_cytokine_concentration_by_legendplex

4.1.6 t_cell_activation

Non-zero fraction of NAs: 0.059

4.1.7 t_cell_polarization

Non-zero fraction of NAs: 0.0106

4.2 day_1

4.2.1 pbmc_cell_frequency

4.2.2 pbmc_gene_expression

converting counts to integer mode

4.2.3 plasma_ab_titer

4.2.4 plasma_cytokine_concentration_by_legendplex

4.2.6 t_cell_activation

Fewer than 3 specimen for this combination of assay and day available

4.2.7 t_cell_polarization

Fewer than 3 specimen for this combination of assay and day available

4.3 day_3

4.3.1 pbmc_cell_frequency

4.3.2 pbmc_gene_expression

converting counts to integer mode

4.3.3 plasma_ab_titer

4.3.4 plasma_cytokine_concentration_by_legendplex

4.3.6 t_cell_activation

Fewer than 3 specimen for this combination of assay and day available

4.3.7 t_cell_polarization

Fewer than 3 specimen for this combination of assay and day available

4.4 day_14

4.4.1 pbmc_cell_frequency

4.4.2 pbmc_gene_expression

converting counts to integer mode

4.4.3 plasma_ab_titer

4.4.4 plasma_cytokine_concentration_by_legendplex

4.4.6 t_cell_activation

Fewer than 3 specimen for this combination of assay and day available

4.4.7 t_cell_polarization

Fewer than 3 specimen for this combination of assay and day available

4.5 day_30

4.5.1 pbmc_cell_frequency

Fewer than 3 specimen for this combination of assay and day available

4.5.2 pbmc_gene_expression

Fewer than 3 specimen for this combination of assay and day available

4.5.3 plasma_ab_titer

4.5.4 plasma_cytokine_concentration_by_legendplex

Fewer than 3 specimen for this combination of assay and day available

4.5.6 t_cell_activation

4.5.7 t_cell_polarization

Non-zero fraction of NAs: 0.0435

5 Appendix

Code
sessionInfo()
R version 4.2.3 (2023-03-15)
Platform: aarch64-apple-darwin20 (64-bit)
Running under: macOS 14.6.1

Matrix products: default
BLAS:   /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/lib/libRlapack.dylib

locale:
[1] C

attached base packages:
[1] stats4    stats     graphics  grDevices datasets  utils     methods  
[8] base     

other attached packages:
 [1] magick_2.8.3                FactoMineR_2.11            
 [3] factoextra_1.0.7            DESeq2_1.38.3              
 [5] SummarizedExperiment_1.28.0 MatrixGenerics_1.10.0      
 [7] matrixStats_1.4.1           GenomicRanges_1.50.2       
 [9] GenomeInfoDb_1.34.9         IRanges_2.32.0             
[11] S4Vectors_0.36.2            lme4_1.1-35.5              
[13] Matrix_1.6-5                vsn_3.66.0                 
[15] Biobase_2.58.0              BiocGenerics_0.44.0        
[17] ggdark_0.2.1                lubridate_1.9.3            
[19] forcats_1.0.0               stringr_1.5.1              
[21] dplyr_1.1.4                 purrr_1.0.2                
[23] readr_2.1.5                 tidyr_1.3.1                
[25] tibble_3.2.1                ggplot2_3.5.1              
[27] tidyverse_2.0.0            

loaded via a namespace (and not attached):
  [1] minqa_1.2.8            colorspace_2.1-1       ggsignif_0.6.4        
  [4] estimability_1.5.1     XVector_0.38.0         ggpubr_0.6.0          
  [7] farver_2.1.2           affyio_1.68.0          DT_0.33               
 [10] ggrepel_0.9.6          bit64_4.5.2            AnnotationDbi_1.60.2  
 [13] fansi_1.0.6            mvtnorm_1.3-1          codetools_0.2-20      
 [16] splines_4.2.3          leaps_3.2              impute_1.72.3         
 [19] cachem_1.1.0           geneplotter_1.76.0     knitr_1.49            
 [22] Formula_1.2-5          jsonlite_1.8.9         nloptr_2.1.1          
 [25] broom_1.0.7            annotate_1.76.0        cluster_2.1.6         
 [28] png_0.1-8              BiocManager_1.30.25    compiler_4.2.3        
 [31] httr_1.4.7             backports_1.5.0        emmeans_1.10.5        
 [34] fastmap_1.2.0          limma_3.54.2           cli_3.6.3             
 [37] htmltools_0.5.8.1      tools_4.2.3            coda_0.19-4.1         
 [40] gtable_0.3.6           glue_1.8.0             GenomeInfoDbData_1.2.9
 [43] affy_1.76.0            Rcpp_1.0.13-1          carData_3.0-5         
 [46] vctrs_0.6.5            Biostrings_2.66.0      preprocessCore_1.60.2 
 [49] nlme_3.1-166           xfun_0.49              timechange_0.3.0      
 [52] lifecycle_1.0.4        renv_1.0.11            rstatix_0.7.2         
 [55] XML_3.99-0.17          zlibbioc_1.44.0        MASS_7.3-60.0.1       
 [58] scales_1.3.0           vroom_1.6.5            hms_1.1.3             
 [61] parallel_4.2.3         RColorBrewer_1.1-3     yaml_2.3.10           
 [64] memoise_2.0.1          stringi_1.8.4          RSQLite_2.3.7         
 [67] boot_1.3-31            BiocParallel_1.32.6    rlang_1.1.4           
 [70] pkgconfig_2.0.3        bitops_1.0-9           evaluate_1.0.1        
 [73] lattice_0.22-6         labeling_0.4.3         htmlwidgets_1.6.4     
 [76] bit_4.5.0              tidyselect_1.2.1       magrittr_2.0.3        
 [79] R6_2.5.1               generics_0.1.3         multcompView_0.1-10   
 [82] DelayedArray_0.24.0    DBI_1.2.3              pillar_1.9.0          
 [85] withr_3.0.2            abind_1.4-8            KEGGREST_1.38.0       
 [88] scatterplot3d_0.3-44   RCurl_1.98-1.16        car_3.1-3             
 [91] crayon_1.5.3           utf8_1.2.4             tzdb_0.4.0            
 [94] rmarkdown_2.29         locfit_1.5-9.10        grid_4.2.3            
 [97] blob_1.2.4             flashClust_1.01-2      digest_0.6.37         
[100] xtable_1.8-4           munsell_0.5.1          viridisLite_0.4.2