Batch Effects After Integrations

Author

Philipp Sven Lars Schäfer

Published

November 28, 2024

1 Packages

Code
suppressPackageStartupMessages({
  library(tidyverse)
  library(ggdark)
  library(vsn)
  library(lme4)
  library(DESeq2)
  library(factoextra)
  library(FactoMineR)
  library(magick) # formatting
  source(file.path("..", "src", "read_data.R"))
  source(file.path("..", "src", "generate_targets.R"))
  source(file.path("..", "src", "model.R"))
  source(file.path("..", "src", "batch_effects.R"))
  source(file.path("..", "src", "normalize_integrate.R"))
})
knitr::knit_hooks$set(crop = knitr::hook_pdfcrop) # formatting

2 Data

Code
input_dir = file.path("..", "data")
Code
celltype_meta <- read_celltype_meta(input_dir)
gene_meta <- read_gene_meta(input_dir)
protein_meta <- read_protein_meta(input_dir)

meta_data <- read_harmonized_meta_data(input_dir)

RECOMPUTE <- TRUE
if (RECOMPUTE) {
  experimental_data <- read_raw_experimental_data(input_dir)
  experimental_data <- filter_experimental_data(meta_data=meta_data, 
                                                experimental_data=experimental_data, 
                                                gene_meta=gene_meta)
  experimental_data <- normalize_experimental_data(meta_data=meta_data, 
                                                   raw_experimental_data=experimental_data,
                                                   gene_meta=gene_meta)
  experimental_data <- integrate_experimental_data(meta_data=meta_data, 
                                                   normalized_experimental_data=experimental_data)
  write_rds(experimental_data, 
            file = file.path(input_dir, "prc_datasets", 
                             "integrated_experimental_data.RDS"))
} else {
  experimental_data <- read_rds(file = file.path(input_dir, "prc_datasets", 
                                                 "integrated_experimental_data.RDS"))
}
pbmc_cell_frequency | Removed 550 specimens because missing in meta data
plasma_ab_titer | Removed 6931 specimens because missing in meta data
plasma_cytokine_concentration_by_olink | Removed 495 specimens because missing in meta data
pbmc_cell_frequency | Removed 56 features because not in feature subset
plasma_ab_titer | Removed 48 features because not in feature subset
plasma_cytokine_concentration_by_olink | Removed 234 features because not in feature subset
t_cell_polarization | Removed 3 features because not in feature subset
plasma_cytokine_concentration_by_olink | Removed 300 features because qc warning
plasma_ab_titer | Removed 10540 measurements because wrong unit used
plasma_cytokine_concentration_by_olink | Removed 2400 measurements because wrong unit used
plasma_cytokine_concentration_by_legendplex | Removed 8 because specimen is outlier
plasma_cytokine_concentration_by_olink | Removed specimen 750, 760, 824, 833, 894, 903 because fraction of measurements below LOQ > 50%
plasma_ab_titer | Removed specimen 674, 675, 676 because fraction of measurements below LOD > 50%
converting counts to integer mode
Found 4 batches
Using null model in ComBat-seq.
Adjusting for 0 covariate(s) or covariate level(s)
Estimating dispersions
Fitting the GLM model
Shrinkage off - using GLM estimates for parameters
Adjusting the data
Found3batches
Adjusting for0covariate(s) or covariate level(s)
Standardizing Data across genes
Fitting L/S model and finding priors
Finding parametric adjustments
Adjusting the Data
Code
experimental_data <- experimental_data[-which(names(experimental_data) ==
                                                "pbmc_gene_expression_counts")]
specimen_per_day <- get_specimen_per_day(meta_data=meta_data)

3 Conclusions

3.1 PBMC Fractions

3.2 PBMC Gene Expression

  • Even after running ComBat-seq there is still a substantial batch effect.

  • However, here https://rpubs.com/pshinde/Alldata_batcheffect_correction_3rd_challenge, there is not batch effect, I am not sure why that is.

  • Maybe it is because I am batch-correcting the data for each day relative to boost?

3.3 Plasma Antibody Levels

3.4 Plasma Cytokine Concentration by Legendplex

3.6 T Cell Activation

  • No further processing needed I guess.

3.7 T Cell Polarization

  • Specimen are only available for baseline, most of the variance is explained by interaction terms age_at_boost:dataset and infancy_vac:dataset. TODO: Think about what to make of this.

4 Results

Code
##| crop: true # interfers with dynamic toc generation

for (day in names(specimen_per_day)) {
  # day <- names(specimen_per_day)[1]
  
  cat("\n\n")
  cat(paste0("## ", day))
  cat("\n\n")
  
  specimen_df <- specimen_per_day[[day]]
  
  for (assay in names(experimental_data)) {
    # assay <- names(experimental_data)[4]
    
    cat("\n\n")
    cat(paste0("### ", assay))
    cat("\n\n")
    
    feature_col <- experimental_data_settings[[assay]]$feature_col
    value_col <- experimental_data_settings[[assay]]$value_col
    
    assay_df_long <- experimental_data[[assay]]
    #str(assay_df_long)
    
    specimen_df_filtered <- specimen_df %>%
      dplyr::filter(specimen_id %in% unique(assay_df_long$specimen_id))
    
    if (nrow(specimen_df_filtered) <= 3) {
      cat(paste0("Fewer than 3 specimen for this combination of assay and day available"))
    } else {
      assay_mtx_wide <- assay_df_long %>%
        dplyr::select(dplyr::all_of(c("specimen_id", feature_col, value_col))) %>%
        tidyr::pivot_wider(names_from=dplyr::all_of(feature_col), 
                           values_from=dplyr::all_of(value_col)) %>%
        tibble::column_to_rownames(var="specimen_id") %>%
        as.matrix() %>%
        .[as.character(specimen_df_filtered$specimen_id), ]
      
      impute_mode <- "knn"
      stopifnot(impute_mode %in% c("zero", "median_feature", "knn"))
      
      if (any(is.na(assay_mtx_wide))) {
        cat(paste0("Non-zero fraction of NAs: ", 
                   round(mean(is.na(assay_mtx_wide)), 4)))
        # data: matrix with genes in the rows, samples in the columns
        if (impute_mode == "zero") {
          assay_mtx_wide[is.na(assay_mtx_wide)] <- 0
        } else if (impute_mode == "knn") {
          assay_mtx_wide <- t(impute::impute.knn(data=t(assay_mtx_wide))$data)
        } else if (impute_mode == "median_feature") {
          col_medians <- matrixStats::colMedians(assay_mtx_wide, na.rm=TRUE)
          for (colname in colnames(assay_mtx_wide)) {
            col_vec <- assay_mtx_wide[, colname]
            col_vec[is.na(col_vec)] <- col_medians[colname]
            assay_mtx_wide[, colname] <- col_vec
          }
        }
        stopifnot(!any(is.na(assay_mtx_wide)))
      }
      
      # dist_mtx <- as.matrix(dist(assay_mtx_wide))
      # dist_mtx_median <- median(dist_mtx[lower.tri(dist_mtx)])
      # dist_mtx_binary <- dist_mtx > (3 * dist_mtx_median)
      # outliers <-
      #  which(rowSums(dist_mtx_binary) > (10*median(rowSums(dist_mtx_binary))))
      # no_outliers <- rownames(dist_mtx)[!rownames(dist_mtx) %in% names(outliers)]
      # assay_mtx_wide <- assay_mtx_wide[no_outliers, ]
      
      #str(assay_mtx_wide)
      suppressMessages({
        suppressWarnings({
          p1 <- pvca_analysis(assay_mtx_wide=assay_mtx_wide,
                              meta_data=meta_data)$p1
        })
      })
      print(p1)
      
      plist <- generate_pca_plots(assay_mtx_wide=assay_mtx_wide,
                                  meta_data=meta_data)
      print(plist$p1)
      print(plist$p2)
      print(plist$p3)
    }
    cat(paste0("\n\n"))
  }
}

4.1 day_0

4.1.1 pbmc_cell_frequency

4.1.2 pbmc_gene_expression

4.1.3 plasma_ab_titer

4.1.4 plasma_cytokine_concentration_by_legendplex

4.1.6 t_cell_activation

Non-zero fraction of NAs: 0.059

4.1.7 t_cell_polarization

Non-zero fraction of NAs: 0.0106

4.2 day_1

4.2.1 pbmc_cell_frequency

4.2.2 pbmc_gene_expression

4.2.3 plasma_ab_titer

4.2.4 plasma_cytokine_concentration_by_legendplex

4.2.6 t_cell_activation

Fewer than 3 specimen for this combination of assay and day available

4.2.7 t_cell_polarization

Fewer than 3 specimen for this combination of assay and day available

4.3 day_3

4.3.1 pbmc_cell_frequency

4.3.2 pbmc_gene_expression

4.3.3 plasma_ab_titer

4.3.4 plasma_cytokine_concentration_by_legendplex

4.3.6 t_cell_activation

Fewer than 3 specimen for this combination of assay and day available

4.3.7 t_cell_polarization

Fewer than 3 specimen for this combination of assay and day available

4.4 day_14

4.4.1 pbmc_cell_frequency

4.4.2 pbmc_gene_expression

4.4.3 plasma_ab_titer

4.4.4 plasma_cytokine_concentration_by_legendplex

4.4.6 t_cell_activation

Fewer than 3 specimen for this combination of assay and day available

4.4.7 t_cell_polarization

Fewer than 3 specimen for this combination of assay and day available

4.5 day_30

4.5.1 pbmc_cell_frequency

Fewer than 3 specimen for this combination of assay and day available

4.5.2 pbmc_gene_expression

Fewer than 3 specimen for this combination of assay and day available

4.5.3 plasma_ab_titer

4.5.4 plasma_cytokine_concentration_by_legendplex

Fewer than 3 specimen for this combination of assay and day available

4.5.6 t_cell_activation

4.5.7 t_cell_polarization

Non-zero fraction of NAs: 0.0435

5 Appendix

Code
sessionInfo()
R version 4.2.3 (2023-03-15)
Platform: aarch64-apple-darwin20 (64-bit)
Running under: macOS 14.6.1

Matrix products: default
BLAS:   /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/lib/libRlapack.dylib

locale:
[1] C

attached base packages:
[1] stats4    stats     graphics  grDevices datasets  utils     methods  
[8] base     

other attached packages:
 [1] sva_3.46.0                  BiocParallel_1.32.6        
 [3] genefilter_1.80.3           mgcv_1.9-1                 
 [5] nlme_3.1-166                magick_2.8.3               
 [7] FactoMineR_2.11             factoextra_1.0.7           
 [9] DESeq2_1.38.3               SummarizedExperiment_1.28.0
[11] MatrixGenerics_1.10.0       matrixStats_1.4.1          
[13] GenomicRanges_1.50.2        GenomeInfoDb_1.34.9        
[15] IRanges_2.32.0              S4Vectors_0.36.2           
[17] lme4_1.1-35.5               Matrix_1.6-5               
[19] vsn_3.66.0                  Biobase_2.58.0             
[21] BiocGenerics_0.44.0         ggdark_0.2.1               
[23] lubridate_1.9.3             forcats_1.0.0              
[25] stringr_1.5.1               dplyr_1.1.4                
[27] purrr_1.0.2                 readr_2.1.5                
[29] tidyr_1.3.1                 tibble_3.2.1               
[31] ggplot2_3.5.1               tidyverse_2.0.0            

loaded via a namespace (and not attached):
  [1] minqa_1.2.8            colorspace_2.1-1       ggsignif_0.6.4        
  [4] estimability_1.5.1     XVector_0.38.0         ggpubr_0.6.0          
  [7] farver_2.1.2           affyio_1.68.0          DT_0.33               
 [10] ggrepel_0.9.6          bit64_4.5.2            AnnotationDbi_1.60.2  
 [13] fansi_1.0.6            mvtnorm_1.3-1          codetools_0.2-20      
 [16] splines_4.2.3          leaps_3.2              impute_1.72.3         
 [19] cachem_1.1.0           geneplotter_1.76.0     knitr_1.49            
 [22] Formula_1.2-5          jsonlite_1.8.9         nloptr_2.1.1          
 [25] broom_1.0.7            annotate_1.76.0        cluster_2.1.6         
 [28] png_0.1-8              BiocManager_1.30.25    compiler_4.2.3        
 [31] httr_1.4.7             backports_1.5.0        emmeans_1.10.5        
 [34] fastmap_1.2.0          limma_3.54.2           cli_3.6.3             
 [37] htmltools_0.5.8.1      tools_4.2.3            coda_0.19-4.1         
 [40] gtable_0.3.6           glue_1.8.0             GenomeInfoDbData_1.2.9
 [43] affy_1.76.0            Rcpp_1.0.13-1          carData_3.0-5         
 [46] vctrs_0.6.5            Biostrings_2.66.0      preprocessCore_1.60.2 
 [49] xfun_0.49              timechange_0.3.0       lifecycle_1.0.4       
 [52] renv_1.0.11            rstatix_0.7.2          XML_3.99-0.17         
 [55] edgeR_3.40.2           zlibbioc_1.44.0        MASS_7.3-60.0.1       
 [58] scales_1.3.0           vroom_1.6.5            hms_1.1.3             
 [61] parallel_4.2.3         RColorBrewer_1.1-3     yaml_2.3.10           
 [64] memoise_2.0.1          stringi_1.8.4          RSQLite_2.3.7         
 [67] boot_1.3-31            rlang_1.1.4            pkgconfig_2.0.3       
 [70] bitops_1.0-9           evaluate_1.0.1         lattice_0.22-6        
 [73] labeling_0.4.3         htmlwidgets_1.6.4      bit_4.5.0             
 [76] tidyselect_1.2.1       magrittr_2.0.3         R6_2.5.1              
 [79] generics_0.1.3         multcompView_0.1-10    DelayedArray_0.24.0   
 [82] DBI_1.2.3              pillar_1.9.0           withr_3.0.2           
 [85] abind_1.4-8            survival_3.7-0         KEGGREST_1.38.0       
 [88] scatterplot3d_0.3-44   RCurl_1.98-1.16        car_3.1-3             
 [91] crayon_1.5.3           utf8_1.2.4             tzdb_0.4.0            
 [94] rmarkdown_2.29         locfit_1.5-9.10        grid_4.2.3            
 [97] blob_1.2.4             flashClust_1.01-2      digest_0.6.37         
[100] xtable_1.8-4           munsell_0.5.1          viridisLite_0.4.2