Data Overview

Author

Philipp Sven Lars Schäfer

Published

November 28, 2024

1 Packages

Code

suppressPackageStartupMessages({
  library(tidyverse)
  library(ggdark)
  library(factoextra)
  library(FactoMineR)
  library(magick)
  library(ComplexHeatmap)
  library(circlize)
  source(file.path("..", "src", "read_data.R"))
  source(file.path("..", "src", "colors.R"))
})

2 Data

Code

input_dir = file.path("..", "data")

Code

meta_data <- read_harmonized_meta_data(input_dir)

exp_data <- read_raw_experimental_data(input_dir)
#exp_data <- filter_experimental_data(meta_data, exp_data)

celltype_meta <- read_celltype_meta(input_dir)
gene_meta <- read_gene_meta(input_dir)
protein_meta <- read_protein_meta(input_dir)

3 Checks

Code

# stopifnot(all(
#   purrr::map_lgl(exp_data, ~ all(.x$specimen_id %in% meta_data$specimen_id)))
# )

# no subject is recorded in more than one dataset
(meta_data %>%
  dplyr::count(subject_id, dataset) %>%
  dplyr::count(subject_id) %>%
  dplyr::pull(n) == 1) %>%
  all() %>%
  stopifnot()

# we have the baseline specimen (planned_day_relative_to_boost = 0) for every subject
(meta_data %>%
  dplyr::select(subject_id, planned_day_relative_to_boost, dataset) %>%
  dplyr::distinct() %>%
  dplyr::mutate(is_baseline = (planned_day_relative_to_boost==0)) %>%
  dplyr::group_by(subject_id) %>%
  dplyr::summarize(baseline_present = any(is_baseline),
                   dataset = first(dataset)) %>%
  dplyr::group_by(dataset) %>%
  dplyr::summarize(baseline_present_frac = mean(baseline_present)) %>%
  pull(baseline_present_frac) == 1) %>%
  all() %>%
  stopifnot()

# stopifnot(all(
#   purrr::map_lgl(exp_data, function(df) {
#     if ("unit" %in% colnames(df)) {
#       return(length(unique(df[["unit"]])) == 1)
#     } else {
#       return(TRUE)
#     }
#   })
# ))

4 Demography & Metadata

4.1 How many subjects do we have per dataset / per partition?

Code

meta_data %>%
  dplyr::select(subject_id, dataset, partition, infancy_vac) %>%
  dplyr::distinct() %>%
  dplyr::count(dataset, partition, infancy_vac) %>%
  ggplot(aes(x = n, y = dataset, fill = infancy_vac, color = partition)) +
  geom_col(position = position_dodge(width = 0.9), width = 0.7, alpha=0.5) +  
  geom_text(aes(label = n), 
            position = position_dodge(width = 0.9),  
            hjust = -0.2, vjust = 0.5, size = 3, color = "white") +
  scale_fill_manual(values = infancy_vac_colors) +
  scale_color_manual(values = partition_colors) +
  ggdark::dark_mode(verbose = FALSE)

4.2 What is the age range

Code

p1 <- meta_data %>%
  ggplot() + 
  geom_histogram(aes(x=age_at_boost, fill=dataset, y=after_stat(density)), 
                 color="black", position="identity", bins=30, show.legend=FALSE) +
  facet_wrap(~dataset) +
  scale_fill_manual(values=dataset_colors) +
  ggdark::dark_mode(verbose=FALSE)

p2 <- meta_data %>%
  ggplot() +
  geom_violin(aes(y=dataset, x=age_at_boost, fill=dataset)) +
  ggdark::dark_mode(verbose=FALSE) +
  scale_fill_manual(values=dataset_colors)

cowplot::plot_grid(p1, p2, ncol=2, rel_widths=c(1.5,1))

5 Assays & Timepoints per Subject / Dataset

5.1 What is the difference between planned and actual booster administration

Code

meta_data %>%
  ggplot() +
  geom_histogram(aes(x=diff_relative_to_boost), binwidth=1) +
  facet_wrap(~dataset) +
  ggdark::dark_mode(verbose=FALSE)

5.2 What is the difference between planned and actual for the baseline

Code

meta_data %>%
  dplyr::filter(planned_day_relative_to_boost==0) %>%
  dplyr::mutate(`diff_>_15` = abs(actual_day_relative_to_boost) > 15) %>%
  ggplot() +
  geom_histogram(aes(x=actual_day_relative_to_boost, fill=`diff_>_15`), binwidth=1) +
  facet_wrap(~dataset) +
  ggdark::dark_mode(verbose=FALSE)

5.3 How many time points do we have per subject?

Code

meta_data %>%
  dplyr::count(dataset, subject_id) %>%
  ggplot() +
  geom_histogram(aes(x=n, fill=dataset),color="black", binwidth=1) +
  facet_wrap(~dataset, ncol=2) +
  ggdark::dark_mode(verbose=FALSE) +
  scale_x_continuous(breaks=seq(0, 8, 1)) +
  scale_fill_manual(values=dataset_colors)

5.4 How many assays do we have for the baseline measurement?

Code

assays_per_specimen <- purrr::imap(exp_data, ~ .x %>% 
              dplyr::select(specimen_id) %>%
              dplyr::distinct() %>%
              dplyr::mutate(assay=.y)) %>%
  dplyr::bind_rows() %>%
  dplyr::left_join(meta_data, by="specimen_id") %>%
  dplyr::select(specimen_id, assay, subject_id, planned_day_relative_to_boost, infancy_vac, dataset)

assays_per_specimen %>%
  dplyr::count(subject_id, planned_day_relative_to_boost, dataset, specimen_id) %>%
  dplyr::filter(!is.na(subject_id)) %>%
  dplyr::filter(planned_day_relative_to_boost==0) %>%
  ggplot() +
  geom_histogram(aes(x=n, fill=dataset), color="black", binwidth=1) +
  facet_wrap(~dataset, ncol=2) +
  ggdark::dark_mode(verbose=FALSE) +
  scale_x_continuous(breaks=seq(0, 8, 1)) +
  scale_fill_manual(values=dataset_colors)

Code

for (day in c(0, 1, 3, 14)) {
  
annotation_data <- meta_data %>%
  dplyr::select(specimen_id, planned_day_relative_to_boost, infancy_vac, 
                dataset, biological_sex) %>%
  dplyr::filter(specimen_id %in% assays_per_specimen$specimen_id) %>%
  dplyr::filter(planned_day_relative_to_boost == !!day) %>% 
  dplyr::select(- planned_day_relative_to_boost) %>%
  dplyr::arrange(dataset, infancy_vac, biological_sex) %>%
  tibble::column_to_rownames("specimen_id") %>%
  dplyr::select(dataset, infancy_vac, biological_sex)

heatmap_data <- assays_per_specimen %>%
  dplyr::select(specimen_id, assay) %>%
  dplyr::mutate(value = 1) %>%
  tidyr::pivot_wider(names_from="assay", values_from="value") %>%
  mutate(across(everything(), .fns = ~replace_na(.,0))) %>%
  tibble::column_to_rownames("specimen_id") %>%
  as.matrix() %>%
  t()

colnames(heatmap_data) <- as.character(colnames(heatmap_data))
heatmap_data <- heatmap_data[, rownames(annotation_data)]

# Create the heatmap
ht <- Heatmap(heatmap_data, 
              name = "heatmap", 
              row_title = "", 
              column_title = paste0("Specimens for Planned Day ", day),
              col = colorRamp2(c(0, 1), c("white", "black")),
              cluster_rows = FALSE,
              cluster_columns = FALSE,
              show_row_names = TRUE, 
              show_column_names = FALSE,
              width = unit(16, "cm"),
              top_annotation = ComplexHeatmap::HeatmapAnnotation(df = annotation_data,
                                    col = list(
                                      "dataset" = dataset_colors,
                                      "infancy_vac" = infancy_vac_colors,
                                      "biological_sex" = sex_colors
                                    ),
                                    which="column")
)

draw(ht, 
     heatmap_legend_side = "top",
     annotation_legend_side = "top",
     show_heatmap_legend = FALSE # don't show colorbar
     )
}

5.5 How large is the variance in the baseline measurements?

Code

plot_time_course <- function(experimental_data, modality) {
  #experimental_data <- exp_data; modality <- "pbmc_cell_frequency"
  
  modality_settings <- experimental_data_settings[[modality]]
  feature_col <- modality_settings$feature_col
  value_col <- modality_settings$value_col
  
  plot_df <- experimental_data[[modality]] %>%
    dplyr::left_join(meta_data, by="specimen_id") %>%
    dplyr::filter(planned_day_relative_to_boost <= 0)
  
  # only keep datasets for which we have more than one time point
  datasets_to_keep <- plot_df %>%
    dplyr::select(dataset, planned_day_relative_to_boost) %>%
    dplyr::distinct() %>%
    dplyr::group_by(dataset) %>%
    dplyr::summarize(n = n()) %>%
    dplyr::filter(n > 1) %>%
    dplyr::pull(dataset)
  
  plot_df <- plot_df %>%
    dplyr::filter(dataset %in% datasets_to_keep)
  
  if ("feature_subset" %in% names(modality_settings)) {
    plot_df <- plot_df %>% 
      dplyr::filter(.data[[feature_col]] %in% modality_settings$feature_subset)
  }
  plot_df <- plot_df %>%
    dplyr::mutate(specimen_id = factor(specimen_id))
  
  # check for significant slopes?
  pvals <- purrr::map_dfr(unique(plot_df$dataset), function(d) {
    purrr::map_dfr(unique(plot_df[[feature_col]]), function(f) {
      # d=plot_df$dataset[1]; f=plot_df[[feature_col]][1]
      model_df <- plot_df %>%
        dplyr::filter(dataset==d, .data[[feature_col]]==f)
      lin_model <- lm(formula=paste0(value_col, " ~ planned_day_relative_to_boost"),
         data = model_df)
      tibble(dataset=d, feature=f, pval=summary(lin_model)$coefficients[2, 4])
    })
  }) %>%
    dplyr::mutate(padj = stats::p.adjust(pval))
  pvals %>%
    dplyr::filter(padj < 0.05) %>%
    print()
  
  p1 <- plot_df %>%
    dplyr::group_by(subject_id, .data[[feature_col]]) %>%
    dplyr::summarize(min_feat = min(.data[[value_col]]),
                     max_feat = max(.data[[value_col]]),
                     mean_feat = mean(.data[[value_col]]),
                     sd_feat = sd(.data[[value_col]]),
                     .groups = "drop") %>%
    dplyr::mutate(cv = sd_feat / mean_feat) %>%
    ggplot() +
    geom_violin(aes(y=.data[[feature_col]], x=cv)) +
    ggdark::dark_mode() +
    labs(x="Coefficient of Variation (CV)")
  
  p2 <- plot_df %>%
    ggplot(aes(x=planned_day_relative_to_boost, y=.data[[value_col]])) +
    geom_point(aes(color=dataset), alpha=0.5) +
    geom_line(aes(group=subject_id), alpha=0.5) +
    facet_wrap(~.data[[feature_col]], scales="free_y") +
    geom_smooth(aes(color=dataset), method="lm", formula = 'y ~ x') +
    ggdark::dark_mode()
  return(list(p1=p1, p2=p2))
}

Code

plot_time_course(experimental_data=exp_data, modality="pbmc_cell_frequency")

# A tibble: 1 x 4
  dataset      feature      pval   padj
  <chr>        <chr>       <dbl>  <dbl>
1 2022_dataset Monocytes 0.00169 0.0338

$p1

$p2

Code

plot_time_course(experimental_data=exp_data, modality="plasma_ab_titer")

# A tibble: 0 x 4
# i 4 variables: dataset <chr>, feature <chr>, pval <dbl>, padj <dbl>

$p1

$p2

Code

plot_time_course(experimental_data=exp_data, modality="plasma_cytokine_concentration_by_legendplex")

# A tibble: 0 x 4
# i 4 variables: dataset <chr>, feature <chr>, pval <dbl>, padj <dbl>

$p1

$p2

Code

plot_time_course(experimental_data=exp_data, modality="plasma_cytokine_concentration_by_olink")

# A tibble: 0 x 4
# i 4 variables: dataset <chr>, feature <chr>, pval <dbl>, padj <dbl>

$p1

Warning: Removed 65 rows containing non-finite outside the scale range
(`stat_ydensity()`).

$p2

Warning: Removed 67 rows containing non-finite outside the scale range
(`stat_smooth()`).

Warning: Removed 67 rows containing missing values or values outside the scale range
(`geom_point()`).

Warning: Removed 1 row containing missing values or values outside the scale range
(`geom_line()`).

Code

plot_time_course(experimental_data=exp_data, modality="t_cell_activation")

# A tibble: 0 x 4
# i 4 variables: dataset <chr>, feature <chr>, pval <dbl>, padj <dbl>

$p1

$p2

Code

plot_time_course(experimental_data=exp_data, modality="t_cell_polarization")

# A tibble: 0 x 4
# i 4 variables: dataset <chr>, feature <chr>, pval <dbl>, padj <dbl>

$p1

Warning: Removed 39 rows containing non-finite outside the scale range
(`stat_ydensity()`).

$p2

6 Training Data

6.1 How many subjects can we use to construct training data?

6.1.1 1) Antibody Level Tasks (anti-PT 14 days after booster)

All but 7 people were assayed approximately 14 days after the booster administration

Code

meta_data %>%
  dplyr::filter(planned_day_relative_to_boost==14) %>%
  ggplot(aes(x=diff_relative_to_boost)) +
  geom_histogram(binwidth=1, color = "white", fill = "blue", alpha=0.1) +
  stat_bin(aes(label = after_stat(count)), binwidth=1, geom = "text", vjust = -0.5, color = "white", size = 3) +  
  ggdark::dark_mode(verbose=FALSE)

Code

meta_data %>%
  dplyr::select(subject_id, infancy_vac, dataset, biological_sex, ethnicity, race, year_of_birth) %>%
  dplyr::distinct() %>%
  dplyr::mutate(task_1_possible = 
                  subject_id %in% (meta_data %>%
                                     dplyr::filter(planned_day_relative_to_boost==14) %>%
                                     dplyr::left_join(exp_data$plasma_ab_titer,
                                                      by="specimen_id") %>%
                                     dplyr::filter(isotype_antigen=="IgG_PT") %>%
                                     tidyr::drop_na() %>%
                                     dplyr::pull(subject_id))
  ) %>%
  dplyr::group_by(dataset) %>%
  dplyr::summarise(task_1_possible_total = sum(task_1_possible), 
                   task_1_possible_fraction = mean(task_1_possible))

6.1.2 2) Cell Frequency Tasks (Monocytes 1 day after booster)

But there is one person where it does not really make sense

Code

meta_data %>%
  dplyr::filter(planned_day_relative_to_boost==1) %>%
  ggplot(aes(x=diff_relative_to_boost)) +
  geom_histogram(binwidth=1, color = "white", fill = "blue", alpha=0.1) +
  stat_bin(aes(label = after_stat(count)), binwidth=1, geom = "text", vjust = -0.5, color = "white", size = 3) +  
  ggdark::dark_mode(verbose=FALSE)

Code

meta_data %>%
  dplyr::select(subject_id, infancy_vac, dataset, biological_sex, ethnicity, race, year_of_birth) %>%
  dplyr::distinct() %>%
  dplyr::mutate(task_possible = 
                  subject_id %in% (meta_data %>%
                                     dplyr::filter(planned_day_relative_to_boost==1) %>%
                                     dplyr::left_join(exp_data$pbmc_cell_frequency, by="specimen_id") %>%
                                     dplyr::filter(cell_type_name=="Monocytes") %>%
                                     tidyr::drop_na() %>%
                                     dplyr::pull(subject_id))
  ) %>%
  dplyr::group_by(dataset) %>%
  dplyr::summarise(task_possible_total = sum(task_possible), 
                   task_possible_fraction = mean(task_possible))

6.1.3 3) Gene Expression Tasks (CCL3 expression 3 days after booster)

There are 7 people for which this task does nto make sense

Code

meta_data %>%
  dplyr::filter(planned_day_relative_to_boost==3) %>%
  ggplot(aes(x=diff_relative_to_boost)) +
  geom_histogram(binwidth=1, color = "white", fill = "blue", alpha=0.1) +
  stat_bin(aes(label = after_stat(count)), binwidth=1, geom = "text", vjust = -0.5, color = "white", size = 3) +  
  ggdark::dark_mode(verbose=FALSE)

Code

all_genes <- unique(exp_data$pbmc_gene_expression$versioned_ensembl_gene_id)
ccl3_ensembl <- "ENSG00000277632"
ccl3_ensembl_versioned <- all_genes[str_starts(all_genes, ccl3_ensembl)]

meta_data %>%
  dplyr::select(subject_id, infancy_vac, dataset, biological_sex, ethnicity, race, year_of_birth) %>%
  dplyr::distinct() %>%
  dplyr::mutate(task_possible = 
                  subject_id %in% (meta_data %>%
                                     dplyr::filter(planned_day_relative_to_boost==3) %>%
                                     dplyr::left_join(exp_data$pbmc_gene_expression, by="specimen_id") %>%
                                     dplyr::filter(versioned_ensembl_gene_id==ccl3_ensembl_versioned) %>%
                                     tidyr::drop_na() %>%
                                     dplyr::pull(subject_id))
  ) %>%
  dplyr::group_by(dataset) %>%
  dplyr::summarise(task_possible_total = sum(task_possible), 
                   task_possible_fraction = mean(task_possible))

7 Missing Data

7.1 How many NA values do we have per assay?

Code

purrr::imap(exp_data, function(df, modality) {
  feature_col <- experimental_data_settings[[modality]]$feature_col
  value_col <- experimental_data_settings[[modality]]$value_col
  tibble(modality = modality, feature_col = feature_col, sum_na = sum(is.na(df[[value_col]])))
}) %>%
  dplyr::bind_rows()

7.2 Wide format

Code

purrr::imap_dfr(generate_wide_experimental_data(experimental_data=exp_data, 
                                                impute=NULL),
                  function(mtx, modality) {
                    rmeans <- rowMeans(is.na(mtx))
                    tibble(modality=modality, 
                           subject_id = names(rmeans), 
                           na_frac=rmeans)
           }) %>%
  dplyr::group_by(modality) %>%
  dplyr::summarise(mean_na_frac = mean(na_frac))

pbmc_cell_frequency | NA Fraction: 0.320140362330668 | Removed features: ASCs_(Plasmablasts), CD19_N_CD3_N_, CD19_P__CD3_N_, CD3_N_CD19_N_CD56_N_HLA_N_DR_P_CD14_N_CD16_N__cells, CD45_P_, CD56_P__CD3high_T_cells, Lineage_negative_cells_(CD3_N_CD19_N_CD56_N_HLA_N_DR_N_), TB_doublets, T_cells_(CD19_N_CD3_P_CD14_N_), CD33HLADR, CD3CD19, Tregs, mDC, MB_doublets_(CD19_P_CD14_P_), B_NK_doublets_(CD19_P_CD3_N_CD14_N_CD56_P_), T_M_doublets_(CD19_N_CD3_P_CD14_P_), Lineage__N__cells_(CD3_N_CD19_N_CD56_N_HLA_N_DR_N_), Antibody_secreting_B_cells_(ASCs), BNK_doublets, CD19_P_CD3_N_, CD3_N_CD19_N_, CD3_N_CD19_N_CD56_N_HLA_N_DR_N_CD14_N_CD16_N__cells, CD56_P_CD3high_T_cells, Lineage_N__cells_(CD3_N_CD19_N_CD56_N_HLA_N_DR_N_), MB_doublets, T_cells_(CD19_N_CD3_N_CD14_N_), TM_doublets

Warning: Values from `MFI` are not uniquely identified; output will contain list-cols.
* Use `values_fn = list` to suppress this warning.
* Use `values_fn = {summary_fun}` to summarise duplicates.
* Use the following dplyr code to identify duplicates.
  {data} |>
  dplyr::summarise(n = dplyr::n(), .by = c(specimen_id, isotype_antigen)) |>
  dplyr::filter(n > 1L)

plasma_cytokine_concentration_by_olink | NA Fraction: 0.700620342049714 | Removed features: Q969D9, Q96SB3, P16278, O75475, Q05516, Q9NWZ3, Q15661, P14317, Q9UHC6, Q6UXB4, Q00978, Q9UNE0, Q13574, Q8WTT0, P51617, Q9UMR7, Q06830, P30048, P09038, P30044, Q8N608, Q9C035, Q14203, P23229, Q15517, Q14435, Q96DB9, Q12933, P19474, Q8NHJ6, P34130, P08727, O43736, P50135, Q7Z6M3, Q9GZT9, Q12968, O60449, P63241, Q04637, P10747, Q03431, Q13490, P28845, P35240, Q9HCM2, Q9UQQ2, Q96P31, Q07065, P05412, O94992, Q8WXI8, Q04759, P16455, Q9NP99, P78310, P78362, Q13241, O14867, Q6ZUJ8, O43597, P52823, P27540, P58499, O60880, Q05084, O00273, Q96PD2, Q6DN72, O76036, P15514, Q8IU57, Q9UN19, Q9Y2J8, Q9Y3P8, P48740, Q9UQV4, Q9BXN2, Q6EIG7, O95786, P42701, Q92844, Q9UKX5, P52294, P18627, P05113, Q01151, P18564, P78410, Q07011, Q02763, P29965, P01583, Q9BZW8, Q15389, P49763, Q9Y653, O95727, P01732, Q16790, P01574, P00813, P01730, P29474, O00182, P35968, P25942, P20718, P49767, P01137, P09341, O43557, P01127, Q15116, P48023, Q14213,P29459, P09382, Q9NZQ7, P26842, P42830, P12544, P09601, P78423, P32970, Q9NP84, P55773, P06127, P09237, P07585, O75509, P43489, Q29983,Q29980, Q92583, P21246, Q14790, O75144, O43927, Q9BQ51, Q9HBE4, P78556, P10144, P29459,P29460, Q9H6B4, Q96JA1, O95502, P23526, P52888, P43234, Q96LA6, Q04900, P20711, Q9NPH0, Q03403, P25815, O15123, Q9Y5K6, O43827, Q9NY25, Q9GZM7, P35754, P09104, O95544, Q9UBU3, P50452, P35237, Q9HBB8, Q76M96, Q9NR28, Q8N1Q1, Q13275, O43240, Q9UKJ0, O95841, P51693, Q8IZP9, P19971, O75791, A6NI73, P00352, P40259, P09525, P50995, Q9Y286, P26010, P09417, O00161, P27695, O75356, Q9H4D0, P21964, Q15846, P51858, Q6WN34, P09668, Q15155, P16083, Q9BQB4, Q92520, Q8NBS9, P41236, Q9UHL4, Q86VZ4, Q9UHX3, Q6UWV6, Q8WTU2, Q8NI22, Q9BYZ8, Q8NBJ7, Q8WVQ1, P29017, P22466, P19022, Q06418, P46109, Q8WX77, Q9BZR6, P13611, P09467, P01222, P46379, Q92692, P05089, P40818, Q02790, P31431, Q9NWQ8, Q16773, P98082, Q9NQX5, Q641Q3, Q16820, Q01973, NT_N_proBNP, P12724

plasma_cytokine_concentration_by_olink | NA Fraction: 0.700620342049714 | Removed samples: 760, 750, 903, 824, 894, 833

t_cell_activation | NA Fraction: 0.0576923076923077 | Removed samples: 681

t_cell_polarization | NA Fraction: 0.0165394402035623 | Removed samples: 1159, 529, 534, 643, 648

7.3 What is the fraction of NA values in the PBMC Cell Type Frequencies

7.3.1 Per Cell Type

Code

exp_data$pbmc_cell_frequency %>%
  dplyr::mutate(is_na = is.na(percent_live_cell)) %>%
  dplyr::group_by(cell_type_name) %>%
  dplyr::summarise(frac_na = sum(is_na) / n()) %>%
  dplyr::arrange(desc(frac_na)) %>%
  dplyr::mutate(cell_type_name = factor(cell_type_name, levels=rev(cell_type_name))) %>%
  ggplot() +
  geom_col(aes(y=cell_type_name, x=frac_na)) +
  ggdark::dark_mode(verbose=FALSE) +
  labs(x="Fraction of NA Values per Cell Type")

7.3.2 Per Specimen

So there are 5 specimen with more than 20% NA, which are all from the same subject!

Code

exp_data$pbmc_cell_frequency %>%
  dplyr::mutate(is_na = is.na(percent_live_cell)) %>%
  dplyr::group_by(specimen_id) %>%
  dplyr::summarise(frac_na = sum(is_na) / n()) %>%
  ggplot(aes(x = frac_na)) +
  geom_histogram(bins = 20, color = "white", fill = "blue", alpha=0.1) +  # Add color for better visibility
  stat_bin(aes(label = after_stat(count)), bins = 20, geom = "text", 
           vjust = -0.5, color = "white", size = 3) +  # Display bin counts at the top
  ggdark::dark_mode(verbose=FALSE) +
  labs(x="Fraction of NA Values per Specimen")

7.4 What is the fraction of NA values in the Olink Data?

7.4.1 Per Protein

After excluding Q969D9, the largest fraction of NA values is about 5% which is acceptable I guess.

Code

exp_data$plasma_cytokine_concentration_by_olink %>%
  dplyr::mutate(is_na = is.na(concentration)) %>%
  dplyr::group_by(protein_id) %>%
  dplyr::summarise(frac_na = sum(is_na) / n()) %>%
  dplyr::arrange(desc(frac_na)) %>%
  dplyr::mutate(protein_id = factor(protein_id, levels=rev(protein_id))) %>%
  ggplot() +
  geom_col(aes(y=protein_id, x=frac_na)) +
  ggdark::dark_mode(verbose=FALSE) +
  labs(x="Fraction of NA Values per Protein")

7.4.2 Per Specimen

There are 6 specimen with more than 50% NA values, I am not sure whether I should exclude those?

Code

exp_data$plasma_cytokine_concentration_by_olink %>%
  dplyr::mutate(is_na = is.na(concentration)) %>%
  dplyr::group_by(specimen_id) %>%
  dplyr::summarise(frac_na = sum(is_na) / n()) %>%
  ggplot(aes(x = frac_na)) +
  geom_histogram(bins = 20, color = "white", fill = "blue", alpha=0.1) +  # Add color for better visibility
  stat_bin(aes(label = after_stat(count)), bins = 20, geom = "text", 
           vjust = -0.5, color = "white", size = 3) +  # Display bin counts at the top
  ggdark::dark_mode(verbose=FALSE) +
  labs(x="Fraction of NA Values per Specimen")

Code

exp_data$plasma_cytokine_concentration_by_olink %>%
  dplyr::mutate(is_na = is.na(concentration)) %>%
  dplyr::group_by(specimen_id) %>%
  dplyr::summarise(frac_na = sum(is_na) / n()) %>%
  dplyr::filter(frac_na > 0.2) %>%
  dplyr::left_join(meta_data, by="specimen_id") %>%
  dplyr::select(specimen_id, frac_na, subject_id, dataset, 
                actual_day_relative_to_boost, planned_day_relative_to_boost)

8 Different Units that are used

Code

purrr::map_lgl(exp_data, function(df) {
    if ("unit" %in% colnames(df)) {
      return(length(unique(df[["unit"]])) == 1)
    } else {
      return(TRUE)
    }
  })

                        pbmc_cell_frequency 
                                       TRUE 
                       pbmc_gene_expression 
                                       TRUE 
                            plasma_ab_titer 
                                      FALSE 
plasma_cytokine_concentration_by_legendplex 
                                       TRUE 
     plasma_cytokine_concentration_by_olink 
                                      FALSE 
                          t_cell_activation 
                                       TRUE 
                        t_cell_polarization 
                                       TRUE

Check if this is also the case in the harmonized data. Yes it is. So how to deal with this?

Code

harm_exp_data <- read_harmonized_experimental_data_depr(input_dir)

purrr::map_lgl(harm_exp_data, function(df) {
    if ("unit" %in% colnames(df)) {
      return(length(unique(df[["unit"]])) == 1)
    } else {
      return(TRUE)
    }
  })

                         pbmc_cell_frequency 
                                        TRUE 
                        pbmc_gene_expression 
                                        TRUE 
                      plasma_antibody_levels 
                                       FALSE 
plasma_cytokine_concentrations_by_legendplex 
                                        TRUE 
     plasma_cytokine_concentrations_by_olink 
                                       FALSE 
                           t_cell_activation 
                                        TRUE 
                         t_cell_polarization 
                                        TRUE

Code

rm(harm_exp_data)

8.1 plasma_ab_titer

See also: https://discuss.cmi-pb.org/t/multiple-units-in-the-ab-titer-table/57/3

Code

exp_data$plasma_ab_titer %>%
  dplyr::left_join(meta_data, by="specimen_id") %>%
  dplyr::count(dataset, unit)

8.2 plasma_cytokine_concentration_by_olink

Code

exp_data$plasma_cytokine_concentration_by_olink %>%
  dplyr::left_join(meta_data, by="specimen_id") %>%
  dplyr::count(dataset, unit)

9 Question for Assay Metadata

9.1 How does the quality control for Olink look like?

Code

exp_data$plasma_cytokine_concentration_by_olink %>%
  dplyr::count(quality_control)

Code

exp_data$plasma_cytokine_concentration_by_olink %>%
  dplyr::left_join(meta_data, by="specimen_id") %>%
  dplyr::count(partition, dataset)

Code

exp_data$plasma_cytokine_concentration_by_legendplex %>%
  dplyr::left_join(meta_data, by="specimen_id") %>%
  dplyr::count(partition, dataset)

9.2 How often is the Cytokine Concentration below the LOD in Olink

Code

# check how often is it below the limit of detection?
exp_data$plasma_cytokine_concentration_by_olink %>%
  dplyr::inner_join(meta_data, by="specimen_id") %>%
  dplyr::mutate(below_lod = concentration < lower_limit_of_quantitation) %>%
  dplyr::group_by(dataset) %>%
  dplyr::summarise(below_lod = mean(below_lod, na.rm=TRUE))

Code

set.seed(1)
p1 <- exp_data$plasma_cytokine_concentration_by_olink %>%
  dplyr::mutate(below_lod = concentration < lower_limit_of_quantitation) %>%
  dplyr::mutate(qc_warning = quality_control == "Warning") %>%
  dplyr::group_by(protein_id) %>%
  dplyr::summarize(mean_below_lod = mean(below_lod, na.rm=TRUE),
                   mean_qc_warning = mean(qc_warning, na.rm=TRUE)) %>%
  dplyr::left_join(cytokine_uniprot_mapping, by="protein_id") %>%
  ggplot() +
  geom_point(aes(x = mean_below_lod, y = mean_qc_warning)) +
  ggrepel::geom_text_repel(aes(x = mean_below_lod, y = mean_qc_warning, 
                      label = ifelse(mean_below_lod > 0.2, protein_id, '')),
                  max.overlaps = Inf, color="grey") + # This ensures all labels are shown
  ggdark::dark_mode()

p2 <- exp_data$plasma_cytokine_concentration_by_olink %>%
  dplyr::mutate(below_lod = concentration < lower_limit_of_quantitation) %>%
  dplyr::mutate(qc_warning = quality_control == "Warning") %>%
  dplyr::group_by(protein_id) %>%
  dplyr::summarize(mean_below_lod = mean(below_lod, na.rm=TRUE),
                   mean_qc_warning = mean(qc_warning, na.rm=TRUE)) %>%
  dplyr::left_join(cytokine_uniprot_mapping, by="protein_id") %>%
  ggplot() +
  geom_point(aes(x = mean_below_lod, y = mean_qc_warning)) +
  ggrepel::geom_text_repel(aes(x = mean_below_lod, y = mean_qc_warning, 
                      label = ifelse(mean_below_lod > 0.2, cytokine, '')),
                  max.overlaps = Inf, color="grey") + # This ensures all labels are shown
  ggdark::dark_mode()

cowplot::plot_grid(p1, p2, ncol=2)

Code

exp_data$plasma_cytokine_concentration_by_olink %>%
  dplyr::filter(protein_id %in% c("P60568"))

9.3 How often are the Antibody Titers below the LOD

Code

exp_data$plasma_ab_titer %>%
  dplyr::inner_join(meta_data, by="specimen_id") %>%
  dplyr::mutate(below_lod = MFI < lower_limit_of_detection) %>%
  dplyr::group_by(dataset) %>%
  dplyr::summarise(below_lod = mean(below_lod, na.rm=TRUE))

9.4 What is the fraction of antigen-specific antibody measurements

9.4.1 Per Isotype-Antigen

Code

exp_data$plasma_ab_titer %>%
  dplyr::group_by(isotype_antigen) %>%
  dplyr::summarize(is_antigen_specific_fraction = mean(is_antigen_specific)) %>%
  ggplot(aes(x=is_antigen_specific_fraction)) +
  geom_histogram(bins=30, color = "white", fill = "blue", alpha=0.1) +
  stat_bin(aes(label = after_stat(count)), bins=30, geom = "text", vjust = -0.5, color = "white", size = 3) +  
  ggdark::dark_mode(verbose=FALSE)

Code

exp_data$plasma_ab_titer %>%
  dplyr::group_by(isotype_antigen) %>%
  dplyr::summarize(is_antigen_specific_fraction = mean(is_antigen_specific)) %>%
  dplyr::arrange(is_antigen_specific_fraction)

So the top entry makes sense, IgE_Total does not refer to any specific antigen

9.4.2 Per Specimen

So in some specimen we have non-specific antibody measurements? What does that mean?

Code

exp_data$plasma_ab_titer %>%
  dplyr::group_by(specimen_id) %>%
  dplyr::summarize(is_antigen_specific_fraction = mean(is_antigen_specific)) %>%
  ggplot(aes(x=is_antigen_specific_fraction)) +
  geom_histogram(binwidth=1, color = "white", fill = "blue", alpha=0.1) +
  stat_bin(aes(label = after_stat(count)), binwidth=1, geom = "text", vjust = -0.5, color = "white", size = 3) +  
  ggdark::dark_mode(verbose=FALSE)

9.4.3 Per Year

So this is very weird. For 2021, no antibody measurement is antigen-specific? Maybe this is a bug!

Code

exp_data$plasma_ab_titer %>%
  dplyr::left_join(meta_data, by="specimen_id") %>%
  dplyr::group_by(dataset) %>%
  dplyr::summarize(mean_is_antigen_specific = mean(is_antigen_specific))

10 Hierarchy in PBMC Frequency Data

10.1 NK

There are 3 types of NK cells:

NK cells (CD3-CD19-CD56+): CD19-CD3-CD56+/++
NK: CD19-CD3-CD56+
CD56high NK cells: CD19-CD3-CD56++

And I don’t know what to do with that information!

10.2 No Gating Info for 2023

I just assume it is the same as in 2022, but this might not be true since the data look so different

10.3 No Gating Info for Basophils and CD19 in 2020

I just assume it is the same as in 2021 and 2022

10.4 No Gating Info for non-pDCs at all

Not sure how to fix this)

Code

exp_data$pbmc_cell_frequency %>%
  dplyr::left_join((meta_data %>%
                      dplyr::select(specimen_id, dataset) %>%
                      dplyr::distinct()),
                   by="specimen_id") %>%
  dplyr::left_join((celltype_meta %>%
                      dplyr::select(cell_type_name, dataset, level) %>%
                      dplyr::distinct()),
                    by=c("dataset", "cell_type_name")) %>%
  dplyr::filter(is.na(level)) %>%
  dplyr::select(dataset, cell_type_name) %>%
  dplyr::distinct()

10.5 Should the Proportions sum up to 1 at level 0?

Trying to take the hierarchical information into account, it still does not make sense

Code

exp_data$pbmc_cell_frequency %>%
  dplyr::left_join((meta_data %>%
                      dplyr::select(specimen_id, dataset) %>%
                      dplyr::distinct()),
                   by="specimen_id") %>%
  dplyr::left_join((celltype_meta %>%
                      dplyr::select(cell_type_name, dataset, level) %>%
                      dplyr::distinct()),
                    by=c("dataset", "cell_type_name")) %>%
  dplyr::filter(level == 0) %>%
  dplyr::group_by(specimen_id) %>%
  dplyr::summarize(percent_live_cell_sum = sum(percent_live_cell, na.rm=TRUE)) %>%
  dplyr::left_join(meta_data, by="specimen_id") %>%
  ggplot() +
  geom_violin(aes(x=dataset, y=percent_live_cell_sum)) +
  ggdark::dark_mode(verbose=FALSE)

10.6 Difference in 2023 Dataset

It seems like the 2023 data are very very different, which makes me hesitant to use it for any kind of predictions on the challenge data. E.g. just looking at level 0 which should make sense I get the following

Code

# first check for NAs for the levels
exp_data$pbmc_cell_frequency %>%
  dplyr::left_join((meta_data %>%
                      dplyr::select(specimen_id, dataset) %>%
                      dplyr::distinct()),
                   by="specimen_id") %>%
  dplyr::left_join((celltype_meta %>%
                      dplyr::select(cell_type_name, dataset, level) %>%
                      dplyr::distinct()),
                    by=c("dataset", "cell_type_name")) %>%
  dplyr::filter(is.na(level))

Code

exp_data$pbmc_cell_frequency %>%
  dplyr::left_join((meta_data %>%
                      dplyr::select(specimen_id, dataset) %>%
                      dplyr::distinct()),
                   by="specimen_id") %>%
  dplyr::left_join((celltype_meta %>%
                      dplyr::select(cell_type_name, dataset, level) %>%
                      dplyr::distinct()),
                    by=c("dataset", "cell_type_name")) %>%
  dplyr::filter(level == 0) %>%
  dplyr::group_by(specimen_id) %>%
  dplyr::summarize(percent_live_cell_sum = sum(percent_live_cell, na.rm=TRUE)) %>%
  dplyr::left_join(meta_data, by="specimen_id") %>%
  ggplot() +
  geom_violin(aes(x=dataset, y=percent_live_cell_sum)) +
  ggdark::dark_mode(verbose=FALSE)

Especially, when completely ignoring the hierarchy in the gating information, I would expect to have more than 100% in percelt_live_cell_sum, but this is not true for the specimens from the 2023 dataset

Code

exp_data$pbmc_cell_frequency %>%
  dplyr::left_join((meta_data %>%
                      dplyr::select(specimen_id, dataset) %>%
                      dplyr::distinct()),
                   by="specimen_id") %>%
  dplyr::left_join((celltype_meta %>%
                      dplyr::select(cell_type_name, dataset, level) %>%
                      dplyr::distinct()),
                    by=c("dataset", "cell_type_name")) %>%
  dplyr::group_by(specimen_id) %>%
  dplyr::summarize(percent_live_cell_sum = sum(percent_live_cell, na.rm=TRUE)) %>%
  dplyr::left_join(meta_data, by="specimen_id") %>%
  ggplot() +
  geom_violin(aes(x=dataset, y=percent_live_cell_sum)) +
  ggdark::dark_mode(verbose=FALSE)

Or what I use a list of predefined cell types of interest?

11 Features for T cell Activation Assay (AIM)

Code

exp_data$t_cell_activation %>%
  dplyr::select(stimulation) %>%
  dplyr::distinct()

Code

exp_data$t_cell_activation %>%
  ggplot() +
  geom_histogram(aes(x=analyte_percentages), bins=100) +
  facet_wrap(~stimulation) +
  ggdark::dark_mode()

12 Features for T cell Polarization Assay (FluoroSpot)

Code

exp_data$t_cell_polarization %>%
  dplyr::left_join(protein_meta, by=c("protein_id"="uniprot_id")) %>%
  dplyr::select(protein_id, cytokine) %>%
  dplyr::distinct()

Code

exp_data$t_cell_polarization %>%
  dplyr::select(stimulation) %>%
  dplyr::distinct()

where

DMSO: “Negative” Control (solvent where peptides are dissolved in)
PHA (Phytohaemagglutinin): “Postive” Control (this should elicit strong T cell response?)
PT: Bordetella Pertussis Toxin

Code

exp_data$t_cell_polarization %>%
  dplyr::left_join(protein_meta, by=c("protein_id"="uniprot_id")) %>%
  ggplot() +
  geom_histogram(aes(x=analyte_counts), bins=100) +
  facet_grid(rows=vars(stimulation), cols=vars(cytokine)) +
  scale_x_log10() +
  ggdark::dark_mode()

Code

# TODO: the assay has been normalized relative to DMSO
exp_data$t_cell_polarization %>%
  dplyr::filter(stimulation=="DMSO") %>%
  dplyr::pull(analyte_counts) %>%
  dplyr::near(., 1) %>%
  mean()

[1] 0.997416

Code

exp_data$t_cell_polarization %>%
  dplyr::filter(stimulation!="DMSO") %>%
  dplyr::pull(stimulation_protein_id) %>%
  unique()

[1] "PHA_P01579" "PHA_Q16552" "PHA_P05113" "PT_P01579"  "PT_Q16552" 
[6] "PT_P05113"

13 Other Notes

Only from 2022 onwards do we have time-course baseline measurements, i.e. -30, -14/15, 0

Code

meta_data %>%
  dplyr::count(partition, dataset, planned_day_relative_to_boost) %>%
  dplyr::filter(planned_day_relative_to_boost < 0)

14 Appendix

Code

sessionInfo()

R version 4.2.3 (2023-03-15)
Platform: aarch64-apple-darwin20 (64-bit)
Running under: macOS 14.6.1

Matrix products: default
BLAS:   /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/lib/libRlapack.dylib

locale:
[1] C

attached base packages:
[1] grid      stats     graphics  grDevices datasets  utils     methods  
[8] base     

other attached packages:
 [1] circlize_0.4.16       ComplexHeatmap_2.15.4 magick_2.8.3         
 [4] FactoMineR_2.11       factoextra_1.0.7      ggdark_0.2.1         
 [7] lubridate_1.9.3       forcats_1.0.0         stringr_1.5.1        
[10] dplyr_1.1.4           purrr_1.0.2           readr_2.1.5          
[13] tidyr_1.3.1           tibble_3.2.1          ggplot2_3.5.1        
[16] tidyverse_2.0.0      

loaded via a namespace (and not attached):
 [1] splines_4.2.3        bit64_4.5.2          vroom_1.6.5         
 [4] jsonlite_1.8.9       foreach_1.5.2        BiocManager_1.30.25 
 [7] stats4_4.2.3         renv_1.0.11          yaml_2.3.10         
[10] ggrepel_0.9.6        pillar_1.9.0         lattice_0.22-6      
[13] glue_1.8.0           digest_0.6.37        RColorBrewer_1.1-3  
[16] colorspace_2.1-1     Matrix_1.6-5         cowplot_1.1.3       
[19] htmltools_0.5.8.1    pkgconfig_2.0.3      GetoptLong_1.0.5    
[22] xtable_1.8-4         mvtnorm_1.3-1        scales_1.3.0        
[25] tzdb_0.4.0           timechange_0.3.0     emmeans_1.10.5      
[28] mgcv_1.9-1           farver_2.1.2         generics_0.1.3      
[31] IRanges_2.32.0       DT_0.33              withr_3.0.2         
[34] BiocGenerics_0.44.0  cli_3.6.3            magrittr_2.0.3      
[37] crayon_1.5.3         estimability_1.5.1   evaluate_1.0.1      
[40] fansi_1.0.6          nlme_3.1-166         doParallel_1.0.17   
[43] MASS_7.3-60.0.1      tools_4.2.3          hms_1.1.3           
[46] GlobalOptions_0.1.2  lifecycle_1.0.4      matrixStats_1.4.1   
[49] S4Vectors_0.36.2     munsell_0.5.1        cluster_2.1.6       
[52] flashClust_1.01-2    compiler_4.2.3       multcompView_0.1-10 
[55] rlang_1.1.4          iterators_1.0.14     rjson_0.2.23        
[58] htmlwidgets_1.6.4    leaps_3.2            labeling_0.4.3      
[61] rmarkdown_2.29       gtable_0.3.6         codetools_0.2-20    
[64] R6_2.5.1             knitr_1.49           bit_4.5.0           
[67] fastmap_1.2.0        utf8_1.2.4           clue_0.3-65         
[70] shape_1.4.6.1        stringi_1.8.4        parallel_4.2.3      
[73] Rcpp_1.0.13-1        vctrs_0.6.5          png_0.1-8           
[76] scatterplot3d_0.3-44 tidyselect_1.2.1     xfun_0.49           
[79] coda_0.19-4.1