Batch Effects After Normalization

Author

Philipp Sven Lars Schäfer

Published

November 28, 2024

1 Packages

Code

suppressPackageStartupMessages({
  library(tidyverse)
  library(ggdark)
  library(vsn)
  library(lme4)
  library(DESeq2)
  library(factoextra)
  library(FactoMineR)
  library(magick) # formatting
  source(file.path("..", "src", "read_data.R"))
  source(file.path("..", "src", "generate_targets.R"))
  source(file.path("..", "src", "model.R"))
  source(file.path("..", "src", "batch_effects.R"))
  source(file.path("..", "src", "normalize_integrate.R"))
})
knitr::knit_hooks$set(crop = knitr::hook_pdfcrop) # formatting

2 Data

Code

input_dir = file.path("..", "data")

Code

celltype_meta <- read_celltype_meta(input_dir)
gene_meta <- read_gene_meta(input_dir)
protein_meta <- read_protein_meta(input_dir)

meta_data <- read_harmonized_meta_data(input_dir)

RECOMPUTE <- TRUE
if (RECOMPUTE) {
  experimental_data <- read_raw_experimental_data(input_dir)
  experimental_data <- filter_experimental_data(meta_data=meta_data, 
                                                experimental_data=experimental_data,
                                                gene_meta=gene_meta)
  experimental_data <- normalize_experimental_data(meta_data=meta_data, 
                                                   raw_experimental_data=experimental_data,
                                                   gene_meta=gene_meta)
  write_rds(experimental_data, 
            file = file.path(input_dir, "prc_datasets", 
                             "normalized_experimental_data.RDS"))
} else {
  experimental_data <- read_rds(file = file.path(input_dir, "prc_datasets", 
                                                 "normalized_experimental_data.RDS"))
}

pbmc_cell_frequency | Removed 550 specimens because missing in meta data

plasma_ab_titer | Removed 6931 specimens because missing in meta data

plasma_cytokine_concentration_by_olink | Removed 495 specimens because missing in meta data

pbmc_cell_frequency | Removed 56 features because not in feature subset

plasma_ab_titer | Removed 48 features because not in feature subset

plasma_cytokine_concentration_by_olink | Removed 234 features because not in feature subset

t_cell_polarization | Removed 3 features because not in feature subset

plasma_cytokine_concentration_by_olink | Removed 300 features because qc warning

plasma_ab_titer | Removed 10540 measurements because wrong unit used

plasma_cytokine_concentration_by_olink | Removed 2400 measurements because wrong unit used

plasma_cytokine_concentration_by_legendplex | Removed 8 because specimen is outlier

plasma_cytokine_concentration_by_olink | Removed specimen 750, 760, 824, 833, 894, 903 because fraction of measurements below LOQ > 50%

plasma_ab_titer | Removed specimen 674, 675, 676 because fraction of measurements below LOD > 50%

converting counts to integer mode

Code

experimental_data <- experimental_data[-which(names(experimental_data) ==
                                                "pbmc_gene_expression_counts")]
specimen_per_day <- get_specimen_per_day(meta_data=meta_data)

3 Conclusions

3.1 PBMC Fractions

No further processing needed I guess.

3.2 PBMC Gene Expression

Very strong cohort-specific effects for baseline specimen and post-booster specimen

3.3 Plasma Antibody Levels

Slight batch effect. Not sure whether it needs some kind of batch correction or not.

3.4 Plasma Cytokine Concentration by Legendplex

Slight batch effect. Not sure whether it needs some kind of batch correction or not. The data distribution looks odd anyway!

3.5 Plasma Cytokine Concentration by Olink

No further processing needed I guess.

3.6 T Cell Activation

No further processing needed I guess.

3.7 T Cell Polarization

Specimen are only available for baseline, most of the variance is explained by interaction terms age_at_boost:dataset and infancy_vac:dataset. TODO: Think about what to make of this.

4 Results

Code

##| crop: true # interfers with dynamic toc generation

for (day in names(specimen_per_day)) {
  # day <- names(specimen_per_day)[1]
  
  cat("\n\n")
  cat(paste0("## ", day))
  cat("\n\n")
  
  specimen_df <- specimen_per_day[[day]]
  
  for (assay in names(experimental_data)) {
    # assay <- names(experimental_data)[4]
    
    cat("\n\n")
    cat(paste0("### ", assay))
    cat("\n\n")
    
    feature_col <- experimental_data_settings[[assay]]$feature_col
    value_col <- experimental_data_settings[[assay]]$value_col
    
    assay_df_long <- experimental_data[[assay]]
    #str(assay_df_long)
    
    specimen_df_filtered <- specimen_df %>%
      dplyr::filter(specimen_id %in% unique(assay_df_long$specimen_id))
    
    if (nrow(specimen_df_filtered) <= 3) {
      cat(paste0("Fewer than 3 specimen for this combination of assay and day available"))
    } else {
      assay_mtx_wide <- assay_df_long %>%
        dplyr::select(dplyr::all_of(c("specimen_id", feature_col, value_col))) %>%
        tidyr::pivot_wider(names_from=dplyr::all_of(feature_col), 
                           values_from=dplyr::all_of(value_col)) %>%
        tibble::column_to_rownames(var="specimen_id") %>%
        as.matrix() %>%
        .[as.character(specimen_df_filtered$specimen_id), ]
      
      impute_mode <- "knn"
      stopifnot(impute_mode %in% c("zero", "median", "knn"))
      
      if (any(is.na(assay_mtx_wide))) {
        cat(paste0("Non-zero fraction of NAs: ", 
                   round(mean(is.na(assay_mtx_wide)), 4)))
        # data: matrix with genes in the rows, samples in the columns
        if (impute_mode == "zero") {
          assay_mtx_wide[is.na(assay_mtx_wide)] <- 0
        } else if (impute_mode == "knn") {
          assay_mtx_wide <- t(impute::impute.knn(data=t(assay_mtx_wide))$data)
        } else if (impute_mode == "median") {
          col_medians <- matrixStats::colMedians(assay_mtx_wide, na.rm=TRUE)
          for (colname in colnames(assay_mtx_wide)) {
            col_vec <- assay_mtx_wide[, colname]
            col_vec[is.na(col_vec)] <- col_medians[colname]
            assay_mtx_wide[, colname] <- col_vec
          }
        }
        stopifnot(!any(is.na(assay_mtx_wide)))
      }
      
      # dist_mtx <- as.matrix(dist(assay_mtx_wide))
      # dist_mtx_median <- median(dist_mtx[lower.tri(dist_mtx)])
      # dist_mtx_binary <- dist_mtx > (3 * dist_mtx_median)
      # outliers <-
      #  which(rowSums(dist_mtx_binary) > (10*median(rowSums(dist_mtx_binary))))
      # no_outliers <- rownames(dist_mtx)[!rownames(dist_mtx) %in% names(outliers)]
      # assay_mtx_wide <- assay_mtx_wide[no_outliers, ]
      
      #str(assay_mtx_wide)
      suppressMessages({
        suppressWarnings({
          p1 <- pvca_analysis(assay_mtx_wide=assay_mtx_wide,
                              meta_data=meta_data)$p1
        })
      })
      print(p1)
      
      plist <- generate_pca_plots(assay_mtx_wide=assay_mtx_wide,
                                  meta_data=meta_data)
      print(plist$p1)
      print(plist$p2)
      print(plist$p3)
    }
    cat(paste0("\n\n"))
  }
}

4.1 day_0

4.1.1 pbmc_cell_frequency

4.1.2 pbmc_gene_expression

4.1.3 plasma_ab_titer

4.1.4 plasma_cytokine_concentration_by_legendplex

4.1.5 plasma_cytokine_concentration_by_olink

Non-zero fraction of NAs: 0.0039

4.1.6 t_cell_activation

Non-zero fraction of NAs: 0.059

4.1.7 t_cell_polarization

Non-zero fraction of NAs: 0.0106

4.2 day_1

4.2.1 pbmc_cell_frequency

4.2.2 pbmc_gene_expression

4.2.3 plasma_ab_titer

4.2.4 plasma_cytokine_concentration_by_legendplex

4.2.5 plasma_cytokine_concentration_by_olink

Non-zero fraction of NAs: 0.0038

4.2.6 t_cell_activation

Fewer than 3 specimen for this combination of assay and day available

4.2.7 t_cell_polarization

Fewer than 3 specimen for this combination of assay and day available

4.3 day_3

4.3.1 pbmc_cell_frequency

4.3.2 pbmc_gene_expression

4.3.3 plasma_ab_titer

4.3.4 plasma_cytokine_concentration_by_legendplex

4.3.5 plasma_cytokine_concentration_by_olink

Non-zero fraction of NAs: 8e-04

4.3.6 t_cell_activation

Fewer than 3 specimen for this combination of assay and day available

4.3.7 t_cell_polarization

Fewer than 3 specimen for this combination of assay and day available

4.4 day_14

4.4.1 pbmc_cell_frequency

4.4.2 pbmc_gene_expression

4.4.3 plasma_ab_titer

4.4.4 plasma_cytokine_concentration_by_legendplex

4.4.5 plasma_cytokine_concentration_by_olink

Non-zero fraction of NAs: 0.0025

4.4.6 t_cell_activation

Fewer than 3 specimen for this combination of assay and day available

4.4.7 t_cell_polarization

Fewer than 3 specimen for this combination of assay and day available

4.5 day_30

4.5.1 pbmc_cell_frequency

Fewer than 3 specimen for this combination of assay and day available

4.5.2 pbmc_gene_expression

Fewer than 3 specimen for this combination of assay and day available

4.5.3 plasma_ab_titer

4.5.4 plasma_cytokine_concentration_by_legendplex

Fewer than 3 specimen for this combination of assay and day available

4.5.5 plasma_cytokine_concentration_by_olink

Fewer than 3 specimen for this combination of assay and day available

4.5.6 t_cell_activation

4.5.7 t_cell_polarization

Non-zero fraction of NAs: 0.0435

5 Appendix

Code

sessionInfo()

R version 4.2.3 (2023-03-15)
Platform: aarch64-apple-darwin20 (64-bit)
Running under: macOS 14.6.1

Matrix products: default
BLAS:   /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/lib/libRlapack.dylib

locale:
[1] C

attached base packages:
[1] stats4    stats     graphics  grDevices datasets  utils     methods  
[8] base     

other attached packages:
 [1] sva_3.46.0                  BiocParallel_1.32.6        
 [3] genefilter_1.80.3           mgcv_1.9-1                 
 [5] nlme_3.1-166                magick_2.8.3               
 [7] FactoMineR_2.11             factoextra_1.0.7           
 [9] DESeq2_1.38.3               SummarizedExperiment_1.28.0
[11] MatrixGenerics_1.10.0       matrixStats_1.4.1          
[13] GenomicRanges_1.50.2        GenomeInfoDb_1.34.9        
[15] IRanges_2.32.0              S4Vectors_0.36.2           
[17] lme4_1.1-35.5               Matrix_1.6-5               
[19] vsn_3.66.0                  Biobase_2.58.0             
[21] BiocGenerics_0.44.0         ggdark_0.2.1               
[23] lubridate_1.9.3             forcats_1.0.0              
[25] stringr_1.5.1               dplyr_1.1.4                
[27] purrr_1.0.2                 readr_2.1.5                
[29] tidyr_1.3.1                 tibble_3.2.1               
[31] ggplot2_3.5.1               tidyverse_2.0.0            

loaded via a namespace (and not attached):
  [1] minqa_1.2.8            colorspace_2.1-1       ggsignif_0.6.4        
  [4] estimability_1.5.1     XVector_0.38.0         ggpubr_0.6.0          
  [7] farver_2.1.2           affyio_1.68.0          DT_0.33               
 [10] ggrepel_0.9.6          bit64_4.5.2            AnnotationDbi_1.60.2  
 [13] fansi_1.0.6            mvtnorm_1.3-1          codetools_0.2-20      
 [16] splines_4.2.3          leaps_3.2              impute_1.72.3         
 [19] cachem_1.1.0           geneplotter_1.76.0     knitr_1.49            
 [22] Formula_1.2-5          jsonlite_1.8.9         nloptr_2.1.1          
 [25] broom_1.0.7            annotate_1.76.0        cluster_2.1.6         
 [28] png_0.1-8              BiocManager_1.30.25    compiler_4.2.3        
 [31] httr_1.4.7             backports_1.5.0        emmeans_1.10.5        
 [34] fastmap_1.2.0          limma_3.54.2           cli_3.6.3             
 [37] htmltools_0.5.8.1      tools_4.2.3            coda_0.19-4.1         
 [40] gtable_0.3.6           glue_1.8.0             GenomeInfoDbData_1.2.9
 [43] affy_1.76.0            Rcpp_1.0.13-1          carData_3.0-5         
 [46] vctrs_0.6.5            Biostrings_2.66.0      preprocessCore_1.60.2 
 [49] xfun_0.49              timechange_0.3.0       lifecycle_1.0.4       
 [52] renv_1.0.11            rstatix_0.7.2          XML_3.99-0.17         
 [55] edgeR_3.40.2           zlibbioc_1.44.0        MASS_7.3-60.0.1       
 [58] scales_1.3.0           vroom_1.6.5            hms_1.1.3             
 [61] parallel_4.2.3         RColorBrewer_1.1-3     yaml_2.3.10           
 [64] memoise_2.0.1          stringi_1.8.4          RSQLite_2.3.7         
 [67] boot_1.3-31            rlang_1.1.4            pkgconfig_2.0.3       
 [70] bitops_1.0-9           evaluate_1.0.1         lattice_0.22-6        
 [73] labeling_0.4.3         htmlwidgets_1.6.4      bit_4.5.0             
 [76] tidyselect_1.2.1       magrittr_2.0.3         R6_2.5.1              
 [79] generics_0.1.3         multcompView_0.1-10    DelayedArray_0.24.0   
 [82] DBI_1.2.3              pillar_1.9.0           withr_3.0.2           
 [85] abind_1.4-8            survival_3.7-0         KEGGREST_1.38.0       
 [88] scatterplot3d_0.3-44   RCurl_1.98-1.16        car_3.1-3             
 [91] crayon_1.5.3           utf8_1.2.4             tzdb_0.4.0            
 [94] rmarkdown_2.29         locfit_1.5-9.10        grid_4.2.3            
 [97] blob_1.2.4             flashClust_1.01-2      digest_0.6.37         
[100] xtable_1.8-4           munsell_0.5.1          viridisLite_0.4.2