Author

Philipp Sven Lars Schäfer

Published

November 28, 2024

1 Packages

Code
suppressPackageStartupMessages({
  library(tidyverse)
  library(ggdark)
  library(factoextra)
  library(FactoMineR)
  library(magick)
  library(ComplexHeatmap)
  library(circlize)
  source(file.path("..", "src", "read_data.R"))
  source(file.path("..", "src", "colors.R"))
})

2 Data

Code
input_dir = file.path("..", "data")
Code
meta_data <- read_harmonized_meta_data(input_dir)

exp_data <- read_raw_experimental_data(input_dir)
#exp_data <- filter_experimental_data(meta_data, exp_data)

celltype_meta <- read_celltype_meta(input_dir)
gene_meta <- read_gene_meta(input_dir)
protein_meta <- read_protein_meta(input_dir)

3 Checks

Code
# stopifnot(all(
#   purrr::map_lgl(exp_data, ~ all(.x$specimen_id %in% meta_data$specimen_id)))
# )

# no subject is recorded in more than one dataset
(meta_data %>%
  dplyr::count(subject_id, dataset) %>%
  dplyr::count(subject_id) %>%
  dplyr::pull(n) == 1) %>%
  all() %>%
  stopifnot()

# we have the baseline specimen (planned_day_relative_to_boost = 0) for every subject
(meta_data %>%
  dplyr::select(subject_id, planned_day_relative_to_boost, dataset) %>%
  dplyr::distinct() %>%
  dplyr::mutate(is_baseline = (planned_day_relative_to_boost==0)) %>%
  dplyr::group_by(subject_id) %>%
  dplyr::summarize(baseline_present = any(is_baseline),
                   dataset = first(dataset)) %>%
  dplyr::group_by(dataset) %>%
  dplyr::summarize(baseline_present_frac = mean(baseline_present)) %>%
  pull(baseline_present_frac) == 1) %>%
  all() %>%
  stopifnot()

# stopifnot(all(
#   purrr::map_lgl(exp_data, function(df) {
#     if ("unit" %in% colnames(df)) {
#       return(length(unique(df[["unit"]])) == 1)
#     } else {
#       return(TRUE)
#     }
#   })
# ))

4 Demography & Metadata

4.1 How many subjects do we have per dataset / per partition?

Code
meta_data %>%
  dplyr::select(subject_id, dataset, partition, infancy_vac) %>%
  dplyr::distinct() %>%
  dplyr::count(dataset, partition, infancy_vac) %>%
  ggplot(aes(x = n, y = dataset, fill = infancy_vac, color = partition)) +
  geom_col(position = position_dodge(width = 0.9), width = 0.7, alpha=0.5) +  
  geom_text(aes(label = n), 
            position = position_dodge(width = 0.9),  
            hjust = -0.2, vjust = 0.5, size = 3, color = "white") +
  scale_fill_manual(values = infancy_vac_colors) +
  scale_color_manual(values = partition_colors) +
  ggdark::dark_mode(verbose = FALSE)

4.2 What is the age range

Code
p1 <- meta_data %>%
  ggplot() + 
  geom_histogram(aes(x=age_at_boost, fill=dataset, y=after_stat(density)), 
                 color="black", position="identity", bins=30, show.legend=FALSE) +
  facet_wrap(~dataset) +
  scale_fill_manual(values=dataset_colors) +
  ggdark::dark_mode(verbose=FALSE)

p2 <- meta_data %>%
  ggplot() +
  geom_violin(aes(y=dataset, x=age_at_boost, fill=dataset)) +
  ggdark::dark_mode(verbose=FALSE) +
  scale_fill_manual(values=dataset_colors)

cowplot::plot_grid(p1, p2, ncol=2, rel_widths=c(1.5,1))

5 Assays & Timepoints per Subject / Dataset

5.1 What is the difference between planned and actual booster administration

Code
meta_data %>%
  ggplot() +
  geom_histogram(aes(x=diff_relative_to_boost), binwidth=1) +
  facet_wrap(~dataset) +
  ggdark::dark_mode(verbose=FALSE)

5.2 What is the difference between planned and actual for the baseline

Code
meta_data %>%
  dplyr::filter(planned_day_relative_to_boost==0) %>%
  dplyr::mutate(`diff_>_15` = abs(actual_day_relative_to_boost) > 15) %>%
  ggplot() +
  geom_histogram(aes(x=actual_day_relative_to_boost, fill=`diff_>_15`), binwidth=1) +
  facet_wrap(~dataset) +
  ggdark::dark_mode(verbose=FALSE)

5.3 How many time points do we have per subject?

Code
meta_data %>%
  dplyr::count(dataset, subject_id) %>%
  ggplot() +
  geom_histogram(aes(x=n, fill=dataset),color="black", binwidth=1) +
  facet_wrap(~dataset, ncol=2) +
  ggdark::dark_mode(verbose=FALSE) +
  scale_x_continuous(breaks=seq(0, 8, 1)) +
  scale_fill_manual(values=dataset_colors)

5.4 How many assays do we have for the baseline measurement?

Code
assays_per_specimen <- purrr::imap(exp_data, ~ .x %>% 
              dplyr::select(specimen_id) %>%
              dplyr::distinct() %>%
              dplyr::mutate(assay=.y)) %>%
  dplyr::bind_rows() %>%
  dplyr::left_join(meta_data, by="specimen_id") %>%
  dplyr::select(specimen_id, assay, subject_id, planned_day_relative_to_boost, infancy_vac, dataset)

assays_per_specimen %>%
  dplyr::count(subject_id, planned_day_relative_to_boost, dataset, specimen_id) %>%
  dplyr::filter(!is.na(subject_id)) %>%
  dplyr::filter(planned_day_relative_to_boost==0) %>%
  ggplot() +
  geom_histogram(aes(x=n, fill=dataset), color="black", binwidth=1) +
  facet_wrap(~dataset, ncol=2) +
  ggdark::dark_mode(verbose=FALSE) +
  scale_x_continuous(breaks=seq(0, 8, 1)) +
  scale_fill_manual(values=dataset_colors)

Code
for (day in c(0, 1, 3, 14)) {
  
annotation_data <- meta_data %>%
  dplyr::select(specimen_id, planned_day_relative_to_boost, infancy_vac, 
                dataset, biological_sex) %>%
  dplyr::filter(specimen_id %in% assays_per_specimen$specimen_id) %>%
  dplyr::filter(planned_day_relative_to_boost == !!day) %>% 
  dplyr::select(- planned_day_relative_to_boost) %>%
  dplyr::arrange(dataset, infancy_vac, biological_sex) %>%
  tibble::column_to_rownames("specimen_id") %>%
  dplyr::select(dataset, infancy_vac, biological_sex)

heatmap_data <- assays_per_specimen %>%
  dplyr::select(specimen_id, assay) %>%
  dplyr::mutate(value = 1) %>%
  tidyr::pivot_wider(names_from="assay", values_from="value") %>%
  mutate(across(everything(), .fns = ~replace_na(.,0))) %>%
  tibble::column_to_rownames("specimen_id") %>%
  as.matrix() %>%
  t()

colnames(heatmap_data) <- as.character(colnames(heatmap_data))
heatmap_data <- heatmap_data[, rownames(annotation_data)]

# Create the heatmap
ht <- Heatmap(heatmap_data, 
              name = "heatmap", 
              row_title = "", 
              column_title = paste0("Specimens for Planned Day ", day),
              col = colorRamp2(c(0, 1), c("white", "black")),
              cluster_rows = FALSE,
              cluster_columns = FALSE,
              show_row_names = TRUE, 
              show_column_names = FALSE,
              width = unit(16, "cm"),
              top_annotation = ComplexHeatmap::HeatmapAnnotation(df = annotation_data,
                                    col = list(
                                      "dataset" = dataset_colors,
                                      "infancy_vac" = infancy_vac_colors,
                                      "biological_sex" = sex_colors
                                    ),
                                    which="column")
)

draw(ht, 
     heatmap_legend_side = "top",
     annotation_legend_side = "top",
     show_heatmap_legend = FALSE # don't show colorbar
     )
}

5.5 How large is the variance in the baseline measurements?

Code
plot_time_course <- function(experimental_data, modality) {
  #experimental_data <- exp_data; modality <- "pbmc_cell_frequency"
  
  modality_settings <- experimental_data_settings[[modality]]
  feature_col <- modality_settings$feature_col
  value_col <- modality_settings$value_col
  
  plot_df <- experimental_data[[modality]] %>%
    dplyr::left_join(meta_data, by="specimen_id") %>%
    dplyr::filter(planned_day_relative_to_boost <= 0)
  
  # only keep datasets for which we have more than one time point
  datasets_to_keep <- plot_df %>%
    dplyr::select(dataset, planned_day_relative_to_boost) %>%
    dplyr::distinct() %>%
    dplyr::group_by(dataset) %>%
    dplyr::summarize(n = n()) %>%
    dplyr::filter(n > 1) %>%
    dplyr::pull(dataset)
  
  plot_df <- plot_df %>%
    dplyr::filter(dataset %in% datasets_to_keep)
  
  if ("feature_subset" %in% names(modality_settings)) {
    plot_df <- plot_df %>% 
      dplyr::filter(.data[[feature_col]] %in% modality_settings$feature_subset)
  }
  plot_df <- plot_df %>%
    dplyr::mutate(specimen_id = factor(specimen_id))
  
  # check for significant slopes?
  pvals <- purrr::map_dfr(unique(plot_df$dataset), function(d) {
    purrr::map_dfr(unique(plot_df[[feature_col]]), function(f) {
      # d=plot_df$dataset[1]; f=plot_df[[feature_col]][1]
      model_df <- plot_df %>%
        dplyr::filter(dataset==d, .data[[feature_col]]==f)
      lin_model <- lm(formula=paste0(value_col, " ~ planned_day_relative_to_boost"),
         data = model_df)
      tibble(dataset=d, feature=f, pval=summary(lin_model)$coefficients[2, 4])
    })
  }) %>%
    dplyr::mutate(padj = stats::p.adjust(pval))
  pvals %>%
    dplyr::filter(padj < 0.05) %>%
    print()
  
  p1 <- plot_df %>%
    dplyr::group_by(subject_id, .data[[feature_col]]) %>%
    dplyr::summarize(min_feat = min(.data[[value_col]]),
                     max_feat = max(.data[[value_col]]),
                     mean_feat = mean(.data[[value_col]]),
                     sd_feat = sd(.data[[value_col]]),
                     .groups = "drop") %>%
    dplyr::mutate(cv = sd_feat / mean_feat) %>%
    ggplot() +
    geom_violin(aes(y=.data[[feature_col]], x=cv)) +
    ggdark::dark_mode() +
    labs(x="Coefficient of Variation (CV)")
  
  p2 <- plot_df %>%
    ggplot(aes(x=planned_day_relative_to_boost, y=.data[[value_col]])) +
    geom_point(aes(color=dataset), alpha=0.5) +
    geom_line(aes(group=subject_id), alpha=0.5) +
    facet_wrap(~.data[[feature_col]], scales="free_y") +
    geom_smooth(aes(color=dataset), method="lm", formula = 'y ~ x') +
    ggdark::dark_mode()
  return(list(p1=p1, p2=p2))
}
Code
plot_time_course(experimental_data=exp_data, modality="pbmc_cell_frequency")
# A tibble: 1 x 4
  dataset      feature      pval   padj
  <chr>        <chr>       <dbl>  <dbl>
1 2022_dataset Monocytes 0.00169 0.0338
$p1


$p2

Code
plot_time_course(experimental_data=exp_data, modality="plasma_ab_titer")
# A tibble: 0 x 4
# i 4 variables: dataset <chr>, feature <chr>, pval <dbl>, padj <dbl>
$p1


$p2

Code
plot_time_course(experimental_data=exp_data, modality="plasma_cytokine_concentration_by_legendplex")
# A tibble: 0 x 4
# i 4 variables: dataset <chr>, feature <chr>, pval <dbl>, padj <dbl>
$p1


$p2

Code
plot_time_course(experimental_data=exp_data, modality="plasma_cytokine_concentration_by_olink")
# A tibble: 0 x 4
# i 4 variables: dataset <chr>, feature <chr>, pval <dbl>, padj <dbl>
$p1
Warning: Removed 65 rows containing non-finite outside the scale range
(`stat_ydensity()`).


$p2
Warning: Removed 67 rows containing non-finite outside the scale range
(`stat_smooth()`).
Warning: Removed 67 rows containing missing values or values outside the scale range
(`geom_point()`).
Warning: Removed 1 row containing missing values or values outside the scale range
(`geom_line()`).

Code
plot_time_course(experimental_data=exp_data, modality="t_cell_activation")
# A tibble: 0 x 4
# i 4 variables: dataset <chr>, feature <chr>, pval <dbl>, padj <dbl>
$p1


$p2

Code
plot_time_course(experimental_data=exp_data, modality="t_cell_polarization")
# A tibble: 0 x 4
# i 4 variables: dataset <chr>, feature <chr>, pval <dbl>, padj <dbl>
$p1
Warning: Removed 39 rows containing non-finite outside the scale range
(`stat_ydensity()`).


$p2

6 Training Data

6.1 How many subjects can we use to construct training data?

6.1.1 1) Antibody Level Tasks (anti-PT 14 days after booster)

  • All but 7 people were assayed approximately 14 days after the booster administration
Code
meta_data %>%
  dplyr::filter(planned_day_relative_to_boost==14) %>%
  ggplot(aes(x=diff_relative_to_boost)) +
  geom_histogram(binwidth=1, color = "white", fill = "blue", alpha=0.1) +
  stat_bin(aes(label = after_stat(count)), binwidth=1, geom = "text", vjust = -0.5, color = "white", size = 3) +  
  ggdark::dark_mode(verbose=FALSE)

Code
meta_data %>%
  dplyr::select(subject_id, infancy_vac, dataset, biological_sex, ethnicity, race, year_of_birth) %>%
  dplyr::distinct() %>%
  dplyr::mutate(task_1_possible = 
                  subject_id %in% (meta_data %>%
                                     dplyr::filter(planned_day_relative_to_boost==14) %>%
                                     dplyr::left_join(exp_data$plasma_ab_titer,
                                                      by="specimen_id") %>%
                                     dplyr::filter(isotype_antigen=="IgG_PT") %>%
                                     tidyr::drop_na() %>%
                                     dplyr::pull(subject_id))
  ) %>%
  dplyr::group_by(dataset) %>%
  dplyr::summarise(task_1_possible_total = sum(task_1_possible), 
                   task_1_possible_fraction = mean(task_1_possible))

6.1.2 2) Cell Frequency Tasks (Monocytes 1 day after booster)

  • But there is one person where it does not really make sense
Code
meta_data %>%
  dplyr::filter(planned_day_relative_to_boost==1) %>%
  ggplot(aes(x=diff_relative_to_boost)) +
  geom_histogram(binwidth=1, color = "white", fill = "blue", alpha=0.1) +
  stat_bin(aes(label = after_stat(count)), binwidth=1, geom = "text", vjust = -0.5, color = "white", size = 3) +  
  ggdark::dark_mode(verbose=FALSE)

Code
meta_data %>%
  dplyr::select(subject_id, infancy_vac, dataset, biological_sex, ethnicity, race, year_of_birth) %>%
  dplyr::distinct() %>%
  dplyr::mutate(task_possible = 
                  subject_id %in% (meta_data %>%
                                     dplyr::filter(planned_day_relative_to_boost==1) %>%
                                     dplyr::left_join(exp_data$pbmc_cell_frequency, by="specimen_id") %>%
                                     dplyr::filter(cell_type_name=="Monocytes") %>%
                                     tidyr::drop_na() %>%
                                     dplyr::pull(subject_id))
  ) %>%
  dplyr::group_by(dataset) %>%
  dplyr::summarise(task_possible_total = sum(task_possible), 
                   task_possible_fraction = mean(task_possible))

6.1.3 3) Gene Expression Tasks (CCL3 expression 3 days after booster)

  • There are 7 people for which this task does nto make sense
Code
meta_data %>%
  dplyr::filter(planned_day_relative_to_boost==3) %>%
  ggplot(aes(x=diff_relative_to_boost)) +
  geom_histogram(binwidth=1, color = "white", fill = "blue", alpha=0.1) +
  stat_bin(aes(label = after_stat(count)), binwidth=1, geom = "text", vjust = -0.5, color = "white", size = 3) +  
  ggdark::dark_mode(verbose=FALSE)

Code
all_genes <- unique(exp_data$pbmc_gene_expression$versioned_ensembl_gene_id)
ccl3_ensembl <- "ENSG00000277632"
ccl3_ensembl_versioned <- all_genes[str_starts(all_genes, ccl3_ensembl)]

meta_data %>%
  dplyr::select(subject_id, infancy_vac, dataset, biological_sex, ethnicity, race, year_of_birth) %>%
  dplyr::distinct() %>%
  dplyr::mutate(task_possible = 
                  subject_id %in% (meta_data %>%
                                     dplyr::filter(planned_day_relative_to_boost==3) %>%
                                     dplyr::left_join(exp_data$pbmc_gene_expression, by="specimen_id") %>%
                                     dplyr::filter(versioned_ensembl_gene_id==ccl3_ensembl_versioned) %>%
                                     tidyr::drop_na() %>%
                                     dplyr::pull(subject_id))
  ) %>%
  dplyr::group_by(dataset) %>%
  dplyr::summarise(task_possible_total = sum(task_possible), 
                   task_possible_fraction = mean(task_possible))

7 Missing Data

7.1 How many NA values do we have per assay?

Code
purrr::imap(exp_data, function(df, modality) {
  feature_col <- experimental_data_settings[[modality]]$feature_col
  value_col <- experimental_data_settings[[modality]]$value_col
  tibble(modality = modality, feature_col = feature_col, sum_na = sum(is.na(df[[value_col]])))
}) %>%
  dplyr::bind_rows()

7.2 Wide format

Code
purrr::imap_dfr(generate_wide_experimental_data(experimental_data=exp_data, 
                                                impute=NULL),
                  function(mtx, modality) {
                    rmeans <- rowMeans(is.na(mtx))
                    tibble(modality=modality, 
                           subject_id = names(rmeans), 
                           na_frac=rmeans)
           }) %>%
  dplyr::group_by(modality) %>%
  dplyr::summarise(mean_na_frac = mean(na_frac))
pbmc_cell_frequency | NA Fraction: 0.320140362330668 | Removed features: ASCs_(Plasmablasts), CD19_N_CD3_N_, CD19_P__CD3_N_, CD3_N_CD19_N_CD56_N_HLA_N_DR_P_CD14_N_CD16_N__cells, CD45_P_, CD56_P__CD3high_T_cells, Lineage_negative_cells_(CD3_N_CD19_N_CD56_N_HLA_N_DR_N_), TB_doublets, T_cells_(CD19_N_CD3_P_CD14_N_), CD33HLADR, CD3CD19, Tregs, mDC, MB_doublets_(CD19_P_CD14_P_), B_NK_doublets_(CD19_P_CD3_N_CD14_N_CD56_P_), T_M_doublets_(CD19_N_CD3_P_CD14_P_), Lineage__N__cells_(CD3_N_CD19_N_CD56_N_HLA_N_DR_N_), Antibody_secreting_B_cells_(ASCs), BNK_doublets, CD19_P_CD3_N_, CD3_N_CD19_N_, CD3_N_CD19_N_CD56_N_HLA_N_DR_N_CD14_N_CD16_N__cells, CD56_P_CD3high_T_cells, Lineage_N__cells_(CD3_N_CD19_N_CD56_N_HLA_N_DR_N_), MB_doublets, T_cells_(CD19_N_CD3_N_CD14_N_), TM_doublets
Warning: Values from `MFI` are not uniquely identified; output will contain list-cols.
* Use `values_fn = list` to suppress this warning.
* Use `values_fn = {summary_fun}` to summarise duplicates.
* Use the following dplyr code to identify duplicates.
  {data} |>
  dplyr::summarise(n = dplyr::n(), .by = c(specimen_id, isotype_antigen)) |>
  dplyr::filter(n > 1L)
plasma_cytokine_concentration_by_olink | NA Fraction: 0.700620342049714 | Removed features: Q969D9, Q96SB3, P16278, O75475, Q05516, Q9NWZ3, Q15661, P14317, Q9UHC6, Q6UXB4, Q00978, Q9UNE0, Q13574, Q8WTT0, P51617, Q9UMR7, Q06830, P30048, P09038, P30044, Q8N608, Q9C035, Q14203, P23229, Q15517, Q14435, Q96DB9, Q12933, P19474, Q8NHJ6, P34130, P08727, O43736, P50135, Q7Z6M3, Q9GZT9, Q12968, O60449, P63241, Q04637, P10747, Q03431, Q13490, P28845, P35240, Q9HCM2, Q9UQQ2, Q96P31, Q07065, P05412, O94992, Q8WXI8, Q04759, P16455, Q9NP99, P78310, P78362, Q13241, O14867, Q6ZUJ8, O43597, P52823, P27540, P58499, O60880, Q05084, O00273, Q96PD2, Q6DN72, O76036, P15514, Q8IU57, Q9UN19, Q9Y2J8, Q9Y3P8, P48740, Q9UQV4, Q9BXN2, Q6EIG7, O95786, P42701, Q92844, Q9UKX5, P52294, P18627, P05113, Q01151, P18564, P78410, Q07011, Q02763, P29965, P01583, Q9BZW8, Q15389, P49763, Q9Y653, O95727, P01732, Q16790, P01574, P00813, P01730, P29474, O00182, P35968, P25942, P20718, P49767, P01137, P09341, O43557, P01127, Q15116, P48023, Q14213,P29459, P09382, Q9NZQ7, P26842, P42830, P12544, P09601, P78423, P32970, Q9NP84, P55773, P06127, P09237, P07585, O75509, P43489, Q29983,Q29980, Q92583, P21246, Q14790, O75144, O43927, Q9BQ51, Q9HBE4, P78556, P10144, P29459,P29460, Q9H6B4, Q96JA1, O95502, P23526, P52888, P43234, Q96LA6, Q04900, P20711, Q9NPH0, Q03403, P25815, O15123, Q9Y5K6, O43827, Q9NY25, Q9GZM7, P35754, P09104, O95544, Q9UBU3, P50452, P35237, Q9HBB8, Q76M96, Q9NR28, Q8N1Q1, Q13275, O43240, Q9UKJ0, O95841, P51693, Q8IZP9, P19971, O75791, A6NI73, P00352, P40259, P09525, P50995, Q9Y286, P26010, P09417, O00161, P27695, O75356, Q9H4D0, P21964, Q15846, P51858, Q6WN34, P09668, Q15155, P16083, Q9BQB4, Q92520, Q8NBS9, P41236, Q9UHL4, Q86VZ4, Q9UHX3, Q6UWV6, Q8WTU2, Q8NI22, Q9BYZ8, Q8NBJ7, Q8WVQ1, P29017, P22466, P19022, Q06418, P46109, Q8WX77, Q9BZR6, P13611, P09467, P01222, P46379, Q92692, P05089, P40818, Q02790, P31431, Q9NWQ8, Q16773, P98082, Q9NQX5, Q641Q3, Q16820, Q01973, NT_N_proBNP, P12724
plasma_cytokine_concentration_by_olink | NA Fraction: 0.700620342049714 | Removed samples: 760, 750, 903, 824, 894, 833
t_cell_activation | NA Fraction: 0.0576923076923077 | Removed samples: 681
t_cell_polarization | NA Fraction: 0.0165394402035623 | Removed samples: 1159, 529, 534, 643, 648

7.3 What is the fraction of NA values in the PBMC Cell Type Frequencies

7.3.1 Per Cell Type

Code
exp_data$pbmc_cell_frequency %>%
  dplyr::mutate(is_na = is.na(percent_live_cell)) %>%
  dplyr::group_by(cell_type_name) %>%
  dplyr::summarise(frac_na = sum(is_na) / n()) %>%
  dplyr::arrange(desc(frac_na)) %>%
  dplyr::mutate(cell_type_name = factor(cell_type_name, levels=rev(cell_type_name))) %>%
  ggplot() +
  geom_col(aes(y=cell_type_name, x=frac_na)) +
  ggdark::dark_mode(verbose=FALSE) +
  labs(x="Fraction of NA Values per Cell Type")

7.3.2 Per Specimen

So there are 5 specimen with more than 20% NA, which are all from the same subject!

Code
exp_data$pbmc_cell_frequency %>%
  dplyr::mutate(is_na = is.na(percent_live_cell)) %>%
  dplyr::group_by(specimen_id) %>%
  dplyr::summarise(frac_na = sum(is_na) / n()) %>%
  ggplot(aes(x = frac_na)) +
  geom_histogram(bins = 20, color = "white", fill = "blue", alpha=0.1) +  # Add color for better visibility
  stat_bin(aes(label = after_stat(count)), bins = 20, geom = "text", 
           vjust = -0.5, color = "white", size = 3) +  # Display bin counts at the top
  ggdark::dark_mode(verbose=FALSE) +
  labs(x="Fraction of NA Values per Specimen")

8 Different Units that are used

Code
purrr::map_lgl(exp_data, function(df) {
    if ("unit" %in% colnames(df)) {
      return(length(unique(df[["unit"]])) == 1)
    } else {
      return(TRUE)
    }
  })
                        pbmc_cell_frequency 
                                       TRUE 
                       pbmc_gene_expression 
                                       TRUE 
                            plasma_ab_titer 
                                      FALSE 
plasma_cytokine_concentration_by_legendplex 
                                       TRUE 
     plasma_cytokine_concentration_by_olink 
                                      FALSE 
                          t_cell_activation 
                                       TRUE 
                        t_cell_polarization 
                                       TRUE 

Check if this is also the case in the harmonized data. Yes it is. So how to deal with this?

Code
harm_exp_data <- read_harmonized_experimental_data_depr(input_dir)

purrr::map_lgl(harm_exp_data, function(df) {
    if ("unit" %in% colnames(df)) {
      return(length(unique(df[["unit"]])) == 1)
    } else {
      return(TRUE)
    }
  })
                         pbmc_cell_frequency 
                                        TRUE 
                        pbmc_gene_expression 
                                        TRUE 
                      plasma_antibody_levels 
                                       FALSE 
plasma_cytokine_concentrations_by_legendplex 
                                        TRUE 
     plasma_cytokine_concentrations_by_olink 
                                       FALSE 
                           t_cell_activation 
                                        TRUE 
                         t_cell_polarization 
                                        TRUE 
Code
rm(harm_exp_data)

8.1 plasma_ab_titer

  • See also: https://discuss.cmi-pb.org/t/multiple-units-in-the-ab-titer-table/57/3
Code
exp_data$plasma_ab_titer %>%
  dplyr::left_join(meta_data, by="specimen_id") %>%
  dplyr::count(dataset, unit)

9 Question for Assay Metadata

9.3 How often are the Antibody Titers below the LOD

Code
exp_data$plasma_ab_titer %>%
  dplyr::inner_join(meta_data, by="specimen_id") %>%
  dplyr::mutate(below_lod = MFI < lower_limit_of_detection) %>%
  dplyr::group_by(dataset) %>%
  dplyr::summarise(below_lod = mean(below_lod, na.rm=TRUE))

9.4 What is the fraction of antigen-specific antibody measurements

9.4.1 Per Isotype-Antigen

Code
exp_data$plasma_ab_titer %>%
  dplyr::group_by(isotype_antigen) %>%
  dplyr::summarize(is_antigen_specific_fraction = mean(is_antigen_specific)) %>%
  ggplot(aes(x=is_antigen_specific_fraction)) +
  geom_histogram(bins=30, color = "white", fill = "blue", alpha=0.1) +
  stat_bin(aes(label = after_stat(count)), bins=30, geom = "text", vjust = -0.5, color = "white", size = 3) +  
  ggdark::dark_mode(verbose=FALSE)

Code
exp_data$plasma_ab_titer %>%
  dplyr::group_by(isotype_antigen) %>%
  dplyr::summarize(is_antigen_specific_fraction = mean(is_antigen_specific)) %>%
  dplyr::arrange(is_antigen_specific_fraction)

So the top entry makes sense, IgE_Total does not refer to any specific antigen

9.4.2 Per Specimen

So in some specimen we have non-specific antibody measurements? What does that mean?

Code
exp_data$plasma_ab_titer %>%
  dplyr::group_by(specimen_id) %>%
  dplyr::summarize(is_antigen_specific_fraction = mean(is_antigen_specific)) %>%
  ggplot(aes(x=is_antigen_specific_fraction)) +
  geom_histogram(binwidth=1, color = "white", fill = "blue", alpha=0.1) +
  stat_bin(aes(label = after_stat(count)), binwidth=1, geom = "text", vjust = -0.5, color = "white", size = 3) +  
  ggdark::dark_mode(verbose=FALSE)

9.4.3 Per Year

So this is very weird. For 2021, no antibody measurement is antigen-specific? Maybe this is a bug!

Code
exp_data$plasma_ab_titer %>%
  dplyr::left_join(meta_data, by="specimen_id") %>%
  dplyr::group_by(dataset) %>%
  dplyr::summarize(mean_is_antigen_specific = mean(is_antigen_specific))

10 Hierarchy in PBMC Frequency Data

10.1 NK

There are 3 types of NK cells:

  • NK cells (CD3-CD19-CD56+): CD19-CD3-CD56+/++
  • NK: CD19-CD3-CD56+
  • CD56high NK cells: CD19-CD3-CD56++

And I don’t know what to do with that information!

10.2 No Gating Info for 2023

  • I just assume it is the same as in 2022, but this might not be true since the data look so different

10.3 No Gating Info for Basophils and CD19 in 2020

  • I just assume it is the same as in 2021 and 2022

10.4 No Gating Info for non-pDCs at all

Not sure how to fix this)

Code
exp_data$pbmc_cell_frequency %>%
  dplyr::left_join((meta_data %>%
                      dplyr::select(specimen_id, dataset) %>%
                      dplyr::distinct()),
                   by="specimen_id") %>%
  dplyr::left_join((celltype_meta %>%
                      dplyr::select(cell_type_name, dataset, level) %>%
                      dplyr::distinct()),
                    by=c("dataset", "cell_type_name")) %>%
  dplyr::filter(is.na(level)) %>%
  dplyr::select(dataset, cell_type_name) %>%
  dplyr::distinct()

10.5 Should the Proportions sum up to 1 at level 0?

  • Trying to take the hierarchical information into account, it still does not make sense
Code
exp_data$pbmc_cell_frequency %>%
  dplyr::left_join((meta_data %>%
                      dplyr::select(specimen_id, dataset) %>%
                      dplyr::distinct()),
                   by="specimen_id") %>%
  dplyr::left_join((celltype_meta %>%
                      dplyr::select(cell_type_name, dataset, level) %>%
                      dplyr::distinct()),
                    by=c("dataset", "cell_type_name")) %>%
  dplyr::filter(level == 0) %>%
  dplyr::group_by(specimen_id) %>%
  dplyr::summarize(percent_live_cell_sum = sum(percent_live_cell, na.rm=TRUE)) %>%
  dplyr::left_join(meta_data, by="specimen_id") %>%
  ggplot() +
  geom_violin(aes(x=dataset, y=percent_live_cell_sum)) +
  ggdark::dark_mode(verbose=FALSE)

10.6 Difference in 2023 Dataset

  • It seems like the 2023 data are very very different, which makes me hesitant to use it for any kind of predictions on the challenge data. E.g. just looking at level 0 which should make sense I get the following
Code
# first check for NAs for the levels
exp_data$pbmc_cell_frequency %>%
  dplyr::left_join((meta_data %>%
                      dplyr::select(specimen_id, dataset) %>%
                      dplyr::distinct()),
                   by="specimen_id") %>%
  dplyr::left_join((celltype_meta %>%
                      dplyr::select(cell_type_name, dataset, level) %>%
                      dplyr::distinct()),
                    by=c("dataset", "cell_type_name")) %>%
  dplyr::filter(is.na(level))
Code
exp_data$pbmc_cell_frequency %>%
  dplyr::left_join((meta_data %>%
                      dplyr::select(specimen_id, dataset) %>%
                      dplyr::distinct()),
                   by="specimen_id") %>%
  dplyr::left_join((celltype_meta %>%
                      dplyr::select(cell_type_name, dataset, level) %>%
                      dplyr::distinct()),
                    by=c("dataset", "cell_type_name")) %>%
  dplyr::filter(level == 0) %>%
  dplyr::group_by(specimen_id) %>%
  dplyr::summarize(percent_live_cell_sum = sum(percent_live_cell, na.rm=TRUE)) %>%
  dplyr::left_join(meta_data, by="specimen_id") %>%
  ggplot() +
  geom_violin(aes(x=dataset, y=percent_live_cell_sum)) +
  ggdark::dark_mode(verbose=FALSE)

  • Especially, when completely ignoring the hierarchy in the gating information, I would expect to have more than 100% in percelt_live_cell_sum, but this is not true for the specimens from the 2023 dataset
Code
exp_data$pbmc_cell_frequency %>%
  dplyr::left_join((meta_data %>%
                      dplyr::select(specimen_id, dataset) %>%
                      dplyr::distinct()),
                   by="specimen_id") %>%
  dplyr::left_join((celltype_meta %>%
                      dplyr::select(cell_type_name, dataset, level) %>%
                      dplyr::distinct()),
                    by=c("dataset", "cell_type_name")) %>%
  dplyr::group_by(specimen_id) %>%
  dplyr::summarize(percent_live_cell_sum = sum(percent_live_cell, na.rm=TRUE)) %>%
  dplyr::left_join(meta_data, by="specimen_id") %>%
  ggplot() +
  geom_violin(aes(x=dataset, y=percent_live_cell_sum)) +
  ggdark::dark_mode(verbose=FALSE)

  • Or what I use a list of predefined cell types of interest?

11 Features for T cell Activation Assay (AIM)

Code
exp_data$t_cell_activation %>%
  dplyr::select(stimulation) %>%
  dplyr::distinct()
Code
exp_data$t_cell_activation %>%
  ggplot() +
  geom_histogram(aes(x=analyte_percentages), bins=100) +
  facet_wrap(~stimulation) +
  ggdark::dark_mode()

12 Features for T cell Polarization Assay (FluoroSpot)

Code
exp_data$t_cell_polarization %>%
  dplyr::left_join(protein_meta, by=c("protein_id"="uniprot_id")) %>%
  dplyr::select(protein_id, cytokine) %>%
  dplyr::distinct()
Code
exp_data$t_cell_polarization %>%
  dplyr::select(stimulation) %>%
  dplyr::distinct()

where

  • DMSO: “Negative” Control (solvent where peptides are dissolved in)

  • PHA (Phytohaemagglutinin): “Postive” Control (this should elicit strong T cell response?)

  • PT: Bordetella Pertussis Toxin

Code
exp_data$t_cell_polarization %>%
  dplyr::left_join(protein_meta, by=c("protein_id"="uniprot_id")) %>%
  ggplot() +
  geom_histogram(aes(x=analyte_counts), bins=100) +
  facet_grid(rows=vars(stimulation), cols=vars(cytokine)) +
  scale_x_log10() +
  ggdark::dark_mode()

Code
# TODO: the assay has been normalized relative to DMSO
exp_data$t_cell_polarization %>%
  dplyr::filter(stimulation=="DMSO") %>%
  dplyr::pull(analyte_counts) %>%
  dplyr::near(., 1) %>%
  mean()
[1] 0.997416
Code
exp_data$t_cell_polarization %>%
  dplyr::filter(stimulation!="DMSO") %>%
  dplyr::pull(stimulation_protein_id) %>%
  unique()
[1] "PHA_P01579" "PHA_Q16552" "PHA_P05113" "PT_P01579"  "PT_Q16552" 
[6] "PT_P05113" 

13 Other Notes

  • Only from 2022 onwards do we have time-course baseline measurements, i.e. -30, -14/15, 0
Code
meta_data %>%
  dplyr::count(partition, dataset, planned_day_relative_to_boost) %>%
  dplyr::filter(planned_day_relative_to_boost < 0)

14 Appendix

Code
sessionInfo()
R version 4.2.3 (2023-03-15)
Platform: aarch64-apple-darwin20 (64-bit)
Running under: macOS 14.6.1

Matrix products: default
BLAS:   /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/lib/libRlapack.dylib

locale:
[1] C

attached base packages:
[1] grid      stats     graphics  grDevices datasets  utils     methods  
[8] base     

other attached packages:
 [1] circlize_0.4.16       ComplexHeatmap_2.15.4 magick_2.8.3         
 [4] FactoMineR_2.11       factoextra_1.0.7      ggdark_0.2.1         
 [7] lubridate_1.9.3       forcats_1.0.0         stringr_1.5.1        
[10] dplyr_1.1.4           purrr_1.0.2           readr_2.1.5          
[13] tidyr_1.3.1           tibble_3.2.1          ggplot2_3.5.1        
[16] tidyverse_2.0.0      

loaded via a namespace (and not attached):
 [1] splines_4.2.3        bit64_4.5.2          vroom_1.6.5         
 [4] jsonlite_1.8.9       foreach_1.5.2        BiocManager_1.30.25 
 [7] stats4_4.2.3         renv_1.0.11          yaml_2.3.10         
[10] ggrepel_0.9.6        pillar_1.9.0         lattice_0.22-6      
[13] glue_1.8.0           digest_0.6.37        RColorBrewer_1.1-3  
[16] colorspace_2.1-1     Matrix_1.6-5         cowplot_1.1.3       
[19] htmltools_0.5.8.1    pkgconfig_2.0.3      GetoptLong_1.0.5    
[22] xtable_1.8-4         mvtnorm_1.3-1        scales_1.3.0        
[25] tzdb_0.4.0           timechange_0.3.0     emmeans_1.10.5      
[28] mgcv_1.9-1           farver_2.1.2         generics_0.1.3      
[31] IRanges_2.32.0       DT_0.33              withr_3.0.2         
[34] BiocGenerics_0.44.0  cli_3.6.3            magrittr_2.0.3      
[37] crayon_1.5.3         estimability_1.5.1   evaluate_1.0.1      
[40] fansi_1.0.6          nlme_3.1-166         doParallel_1.0.17   
[43] MASS_7.3-60.0.1      tools_4.2.3          hms_1.1.3           
[46] GlobalOptions_0.1.2  lifecycle_1.0.4      matrixStats_1.4.1   
[49] S4Vectors_0.36.2     munsell_0.5.1        cluster_2.1.6       
[52] flashClust_1.01-2    compiler_4.2.3       multcompView_0.1-10 
[55] rlang_1.1.4          iterators_1.0.14     rjson_0.2.23        
[58] htmlwidgets_1.6.4    leaps_3.2            labeling_0.4.3      
[61] rmarkdown_2.29       gtable_0.3.6         codetools_0.2-20    
[64] R6_2.5.1             knitr_1.49           bit_4.5.0           
[67] fastmap_1.2.0        utf8_1.2.4           clue_0.3-65         
[70] shape_1.4.6.1        stringi_1.8.4        parallel_4.2.3      
[73] Rcpp_1.0.13-1        vctrs_0.6.5          png_0.1-8           
[76] scatterplot3d_0.3-44 tidyselect_1.2.1     xfun_0.49           
[79] coda_0.19-4.1