Data Overview

Author

Philipp Sven Lars Schäfer

Published

November 28, 2024

1 Packages

Code

suppressPackageStartupMessages({
  library(tidyverse)
  library(ggdark)
  library(factoextra)
  library(FactoMineR)
  library(magick)
  library(ComplexHeatmap)
  library(circlize)
  source(file.path("..", "src", "read_data.R"))
  source(file.path("..", "src", "colors.R"))
})

2 Data

Code

input_dir = file.path("..", "data")

Code

meta_data <- read_harmonized_meta_data(input_dir)
gene_meta <- read_gene_meta(input_dir)

#exp_data <- read_harmonized_experimental_data(input_dir)
exp_data <- read_raw_experimental_data(input_dir)
exp_data <- filter_experimental_data(meta_data, exp_data, gene_meta=gene_meta)

pbmc_cell_frequency | Removed 550 specimens because missing in meta data

plasma_ab_titer | Removed 6931 specimens because missing in meta data

plasma_cytokine_concentration_by_olink | Removed 495 specimens because missing in meta data

pbmc_cell_frequency | Removed 56 features because not in feature subset

plasma_ab_titer | Removed 48 features because not in feature subset

plasma_cytokine_concentration_by_olink | Removed 234 features because not in feature subset

t_cell_polarization | Removed 3 features because not in feature subset

plasma_cytokine_concentration_by_olink | Removed 300 features because qc warning

plasma_ab_titer | Removed 10540 measurements because wrong unit used

plasma_cytokine_concentration_by_olink | Removed 2400 measurements because wrong unit used

plasma_cytokine_concentration_by_legendplex | Removed 8 because specimen is outlier

plasma_cytokine_concentration_by_olink | Removed specimen 750, 760, 824, 833, 894, 903 because fraction of measurements below LOQ > 50%

plasma_ab_titer | Removed specimen 674, 675, 676 because fraction of measurements below LOD > 50%

Code

celltype_meta <- read_celltype_meta(input_dir)
gene_meta <- read_gene_meta(input_dir)
protein_meta <- read_protein_meta(input_dir)

3 Checks

Code

# only works due to filtering step above
stopifnot(all(
  purrr::map_lgl(exp_data, ~ all(.x$specimen_id %in% meta_data$specimen_id)))
)

# no subject is recorded in more than one dataset
(meta_data %>%
  dplyr::count(subject_id, dataset) %>%
  dplyr::count(subject_id) %>%
  dplyr::pull(n) == 1) %>%
  all() %>%
  stopifnot()

# we have the baseline specimen (planned_day_relative_to_boost = 0) for every subject
(meta_data %>%
  dplyr::select(subject_id, planned_day_relative_to_boost, dataset) %>%
  dplyr::distinct() %>%
  dplyr::mutate(is_baseline = (planned_day_relative_to_boost==0)) %>%
  dplyr::group_by(subject_id) %>%
  dplyr::summarize(baseline_present = any(is_baseline),
                   dataset = first(dataset)) %>%
  dplyr::group_by(dataset) %>%
  dplyr::summarize(baseline_present_frac = mean(baseline_present)) %>%
  pull(baseline_present_frac) == 1) %>%
  all() %>%
  stopifnot()

stopifnot(all(
  purrr::map_lgl(exp_data, function(df) {
    if ("unit" %in% colnames(df)) {
      return(length(unique(df[["unit"]])) == 1)
    } else {
      return(TRUE)
    }
  })
))

4 Demography & Metadata

4.1 How many subjects do we have per dataset / per partition?

Code

meta_data %>%
  dplyr::select(subject_id, dataset, partition, infancy_vac) %>%
  dplyr::distinct() %>%
  dplyr::count(dataset, partition, infancy_vac) %>%
  ggplot(aes(x = n, y = dataset, fill = infancy_vac, color = partition)) +
  geom_col(position = position_dodge(width = 0.9), width = 0.7, alpha=0.5) +  
  geom_text(aes(label = n), 
            position = position_dodge(width = 0.9),  
            hjust = -0.2, vjust = 0.5, size = 3, color = "white") +
  scale_fill_manual(values = infancy_vac_colors) +
  scale_color_manual(values = partition_colors) +
  ggdark::dark_mode(verbose = FALSE)

4.2 What is the age range

Code

meta_data %>%
  ggplot() + 
  geom_histogram(aes(x=age_at_boost, fill=dataset, y=after_stat(density)), 
                 color="black", position="identity", bins=30) +
  facet_wrap(~dataset) +
  scale_fill_manual(values=dataset_colors) +
  ggdark::dark_mode(verbose=FALSE)

5 Assays & Timepoints per Subject / Dataset

5.1 What is the difference between planned and actual booster administration

Code

meta_data %>%
  ggplot() +
  geom_histogram(aes(x=diff_relative_to_boost), binwidth=1) +
  facet_wrap(~dataset) +
  ggdark::dark_mode(verbose=FALSE)

5.2 What is the difference between planned and actual for the baseline

Code

meta_data %>%
  dplyr::filter(planned_day_relative_to_boost==0) %>%
  dplyr::mutate(`diff_>_15` = abs(actual_day_relative_to_boost) > 15) %>%
  ggplot() +
  geom_histogram(aes(x=actual_day_relative_to_boost, fill=`diff_>_15`), binwidth=1) +
  facet_wrap(~dataset) +
  ggdark::dark_mode(verbose=FALSE)

5.3 How many time points do we have per subject?

Code

meta_data %>%
  dplyr::count(dataset, subject_id) %>%
  ggplot() +
  geom_histogram(aes(x=n, fill=dataset),color="black", binwidth=1) +
  facet_wrap(~dataset, ncol=2) +
  ggdark::dark_mode(verbose=FALSE) +
  scale_x_continuous(breaks=seq(0, 8, 1)) +
  scale_fill_manual(values=dataset_colors)

5.4 How many assays do we have for the baseline measurement?

Code

assays_per_specimen <- purrr::imap(exp_data, ~ .x %>% 
              dplyr::select(specimen_id) %>%
              dplyr::distinct() %>%
              dplyr::mutate(assay=.y)) %>%
  dplyr::bind_rows() %>%
  dplyr::left_join(meta_data, by="specimen_id") %>%
  dplyr::select(specimen_id, assay, subject_id, planned_day_relative_to_boost, infancy_vac, dataset)

assays_per_specimen %>%
  dplyr::count(subject_id, planned_day_relative_to_boost, dataset, specimen_id) %>%
  dplyr::filter(!is.na(subject_id)) %>%
  dplyr::filter(planned_day_relative_to_boost==0) %>%
  ggplot() +
  geom_histogram(aes(x=n, fill=dataset), color="black", binwidth=1) +
  facet_wrap(~dataset, ncol=2) +
  ggdark::dark_mode(verbose=FALSE) +
  scale_x_continuous(breaks=seq(0, 8, 1)) +
  scale_fill_manual(values=dataset_colors)

Code

for (day in c(0, 1, 3, 14)) {
  
annotation_data <- meta_data %>%
  dplyr::select(specimen_id, planned_day_relative_to_boost, infancy_vac, 
                dataset, biological_sex) %>%
  dplyr::filter(specimen_id %in% assays_per_specimen$specimen_id) %>%
  dplyr::filter(planned_day_relative_to_boost == !!day) %>% 
  dplyr::select(- planned_day_relative_to_boost) %>%
  dplyr::arrange(dataset, infancy_vac, biological_sex) %>%
  tibble::column_to_rownames("specimen_id") %>%
  dplyr::select(dataset, infancy_vac, biological_sex)

heatmap_data <- assays_per_specimen %>%
  dplyr::select(specimen_id, assay) %>%
  dplyr::mutate(value = 1) %>%
  tidyr::pivot_wider(names_from="assay", values_from="value") %>%
  mutate(across(everything(), .fns = ~replace_na(.,0))) %>%
  tibble::column_to_rownames("specimen_id") %>%
  as.matrix() %>%
  t()

colnames(heatmap_data) <- as.character(colnames(heatmap_data))
heatmap_data <- heatmap_data[, rownames(annotation_data)]

# Create the heatmap
ht <- Heatmap(heatmap_data, 
              name = "heatmap", 
              row_title = "", 
              column_title = paste0("Specimens for Planned Day ", day),
              col = colorRamp2(c(0, 1), c("white", "black")),
              cluster_rows = FALSE,
              cluster_columns = FALSE,
              show_row_names = TRUE, 
              show_column_names = FALSE,
              width = unit(16, "cm"),
              top_annotation = ComplexHeatmap::HeatmapAnnotation(df = annotation_data,
                                    col = list(
                                      "dataset" = dataset_colors,
                                      "infancy_vac" = infancy_vac_colors,
                                      "biological_sex" = sex_colors
                                    ),
                                    which="column")
)

draw(ht, 
     heatmap_legend_side = "top",
     annotation_legend_side = "top",
     show_heatmap_legend = FALSE # don't show colorbar
     )
}

6 Training Data

6.1 How many subjects can we use to construct training data?

6.1.1 1) Antibody Level Tasks (anti-PT 14 days after booster)

All but 7 people were assayed approximately 14 days after the booster administration

Code

meta_data %>%
  dplyr::filter(planned_day_relative_to_boost==14) %>%
  ggplot(aes(x=diff_relative_to_boost)) +
  geom_histogram(binwidth=1, color = "white", fill = "blue", alpha=0.1) +
  stat_bin(aes(label = after_stat(count)), binwidth=1, geom = "text", vjust = -0.5, color = "white", size = 3) +  
  ggdark::dark_mode(verbose=FALSE)

Code

meta_data %>%
  dplyr::select(subject_id, infancy_vac, dataset, biological_sex, ethnicity, race, year_of_birth) %>%
  dplyr::distinct() %>%
  dplyr::mutate(task_1_possible = 
                  subject_id %in% (meta_data %>%
                                     dplyr::filter(planned_day_relative_to_boost==14) %>%
                                     dplyr::left_join(exp_data$plasma_ab_titer,
                                                      by="specimen_id") %>%
                                     dplyr::filter(isotype_antigen=="IgG_PT") %>%
                                     tidyr::drop_na() %>%
                                     dplyr::pull(subject_id))
  ) %>%
  dplyr::group_by(dataset) %>%
  dplyr::summarise(task_1_possible_total = sum(task_1_possible), 
                   task_1_possible_fraction = mean(task_1_possible))

6.1.2 2) Cell Frequency Tasks (Monocytes 1 day after booster)

But there is one person where it does not really make sense

Code

meta_data %>%
  dplyr::filter(planned_day_relative_to_boost==1) %>%
  ggplot(aes(x=diff_relative_to_boost)) +
  geom_histogram(binwidth=1, color = "white", fill = "blue", alpha=0.1) +
  stat_bin(aes(label = after_stat(count)), binwidth=1, geom = "text", vjust = -0.5, color = "white", size = 3) +  
  ggdark::dark_mode(verbose=FALSE)

Code

meta_data %>%
  dplyr::select(subject_id, infancy_vac, dataset, biological_sex, ethnicity, race, year_of_birth) %>%
  dplyr::distinct() %>%
  dplyr::mutate(task_possible = 
                  subject_id %in% (meta_data %>%
                                     dplyr::filter(planned_day_relative_to_boost==1) %>%
                                     dplyr::left_join(exp_data$pbmc_cell_frequency, by="specimen_id") %>%
                                     dplyr::filter(cell_type_name=="Monocytes") %>%
                                     tidyr::drop_na() %>%
                                     dplyr::pull(subject_id))
  ) %>%
  dplyr::group_by(dataset) %>%
  dplyr::summarise(task_possible_total = sum(task_possible), 
                   task_possible_fraction = mean(task_possible))

6.1.3 3) Gene Expression Tasks (CCL3 expression 3 days after booster)

There are 7 people for which this task does nto make sense

Code

meta_data %>%
  dplyr::filter(planned_day_relative_to_boost==3) %>%
  ggplot(aes(x=diff_relative_to_boost)) +
  geom_histogram(binwidth=1, color = "white", fill = "blue", alpha=0.1) +
  stat_bin(aes(label = after_stat(count)), binwidth=1, geom = "text", vjust = -0.5, color = "white", size = 3) +  
  ggdark::dark_mode(verbose=FALSE)

Code

all_genes <- unique(exp_data$pbmc_gene_expression$versioned_ensembl_gene_id)
ccl3_ensembl <- "ENSG00000277632"
ccl3_ensembl_versioned <- all_genes[str_starts(all_genes, ccl3_ensembl)]

meta_data %>%
  dplyr::select(subject_id, infancy_vac, dataset, biological_sex, ethnicity, race, year_of_birth) %>%
  dplyr::distinct() %>%
  dplyr::mutate(task_possible = 
                  subject_id %in% (meta_data %>%
                                     dplyr::filter(planned_day_relative_to_boost==3) %>%
                                     dplyr::left_join(exp_data$pbmc_gene_expression, by="specimen_id") %>%
                                     dplyr::filter(versioned_ensembl_gene_id==ccl3_ensembl_versioned) %>%
                                     tidyr::drop_na() %>%
                                     dplyr::pull(subject_id))
  ) %>%
  dplyr::group_by(dataset) %>%
  dplyr::summarise(task_possible_total = sum(task_possible), 
                   task_possible_fraction = mean(task_possible))

7 Missing Data

7.1 How many NA values do we have per assay?

Code

purrr::imap(exp_data, function(df, modality) {
  feature_col <- experimental_data_settings[[modality]]$feature_col
  value_col <- experimental_data_settings[[modality]]$value_col
  tibble(modality = modality, feature_col = feature_col, sum_na = sum(is.na(df[[value_col]])))
}) %>%
  dplyr::bind_rows()

7.2 Wide format

Code

purrr::imap_dfr(generate_wide_experimental_data(experimental_data=exp_data, 
                                                impute=NULL),
                  function(mtx, modality) {
                    rmeans <- rowMeans(is.na(mtx))
                    tibble(modality=modality, 
                           subject_id = names(rmeans), 
                           na_frac=rmeans)
           }) %>%
  dplyr::group_by(modality) %>%
  dplyr::summarise(mean_na_frac = mean(na_frac))

t_cell_activation | NA Fraction: 0.0576923076923077 | Removed samples: 681

7.3 What is the fraction of NA values in the PBMC Cell Type Frequencies

With the current selection of cell types, there is no specimen with NA values.

7.3.1 Per Cell Type

Code

exp_data$pbmc_cell_frequency %>%
  dplyr::mutate(is_na = is.na(percent_live_cell)) %>%
  dplyr::group_by(cell_type_name) %>%
  dplyr::summarise(frac_na = sum(is_na) / n()) %>%
  dplyr::arrange(desc(frac_na)) %>%
  dplyr::mutate(cell_type_name = factor(cell_type_name, levels=rev(cell_type_name))) %>%
  ggplot() +
  geom_col(aes(y=cell_type_name, x=frac_na)) +
  ggdark::dark_mode(verbose=FALSE) +
  labs(x="Fraction of NA Values per Cell Type")

7.3.2 Per Cell Specimen

Code

exp_data$pbmc_cell_frequency %>%
  dplyr::mutate(is_na = is.na(percent_live_cell)) %>%
  dplyr::group_by(specimen_id) %>%
  dplyr::summarise(frac_na = sum(is_na) / n()) %>%
  ggplot(aes(x = frac_na)) +
  geom_histogram(bins = 20, color = "white", fill = "blue", alpha=0.1) +  # Add color for better visibility
  stat_bin(aes(label = after_stat(count)), bins = 20, geom = "text", 
           vjust = -0.5, color = "white", size = 3) +  # Display bin counts at the top
  ggdark::dark_mode(verbose=FALSE) +
  labs(x="Fraction of NA Values per Specimen")

7.4 What is the fraction of NA values in the Olink Data?

7.4.1 Per Protein

After excluding Q969D9, the largest fraction of NA values is about 5% which is acceptable I guess.

Code

exp_data$plasma_cytokine_concentration_by_olink %>%
  dplyr::mutate(is_na = is.na(concentration)) %>%
  dplyr::group_by(protein_id) %>%
  dplyr::summarise(frac_na = sum(is_na) / n()) %>%
  dplyr::arrange(desc(frac_na)) %>%
  dplyr::mutate(protein_id = factor(protein_id, levels=rev(protein_id))) %>%
  ggplot() +
  geom_col(aes(y=protein_id, x=frac_na)) +
  ggdark::dark_mode(verbose=FALSE) +
  labs(x="Fraction of NA Values per Protein")

7.4.2 Per Specimen

There are 6 specimen with more than 50% NA values, I am not sure whether I should exlcude those?

Code

exp_data$plasma_cytokine_concentration_by_olink %>%
  dplyr::mutate(is_na = is.na(concentration)) %>%
  dplyr::group_by(specimen_id) %>%
  dplyr::summarise(frac_na = sum(is_na) / n()) %>%
  ggplot(aes(x = frac_na)) +
  geom_histogram(bins = 20, color = "white", fill = "blue", alpha=0.1) +  # Add color for better visibility
  stat_bin(aes(label = after_stat(count)), bins = 20, geom = "text", 
           vjust = -0.5, color = "white", size = 3) +  # Display bin counts at the top
  ggdark::dark_mode(verbose=FALSE) +
  labs(x="Fraction of NA Values per Specimen")

Code

exp_data$plasma_cytokine_concentration_by_olink %>%
  dplyr::mutate(is_na = is.na(concentration)) %>%
  dplyr::group_by(specimen_id) %>%
  dplyr::summarise(frac_na = sum(is_na) / n()) %>%
  dplyr::filter(frac_na > 0.2) %>%
  dplyr::left_join(meta_data, by="specimen_id") %>%
  dplyr::select(specimen_id, frac_na, subject_id, dataset, 
                actual_day_relative_to_boost, planned_day_relative_to_boost)

8 Question for Assay Metadata

8.1 How does the quality control for Olink look like?

Code

exp_data$plasma_cytokine_concentration_by_olink %>%
  dplyr::count(quality_control)

Code

exp_data$plasma_cytokine_concentration_by_olink %>%
  dplyr::left_join(meta_data, by="specimen_id") %>%
  dplyr::count(partition, dataset)

Code

exp_data$plasma_cytokine_concentration_by_legendplex %>%
  dplyr::left_join(meta_data, by="specimen_id") %>%
  dplyr::count(partition, dataset)

8.2 How often is the Cytokine Concentration below the LOD in Olink

Code

# check how often is it below the limit of detection?
exp_data$plasma_cytokine_concentration_by_olink %>%
  dplyr::mutate(below_lod = concentration < lower_limit_of_quantitation) %>%
  dplyr::count(below_lod)

Code

set.seed(1)
p1 <- exp_data$plasma_cytokine_concentration_by_olink %>%
  dplyr::mutate(below_lod = concentration < lower_limit_of_quantitation) %>%
  dplyr::mutate(qc_warning = quality_control == "Warning") %>%
  dplyr::group_by(protein_id) %>%
  dplyr::summarize(mean_below_lod = mean(below_lod, na.rm=TRUE),
                   mean_qc_warning = mean(qc_warning, na.rm=TRUE)) %>%
  dplyr::left_join(cytokine_uniprot_mapping, by="protein_id") %>%
  ggplot() +
  geom_point(aes(x = mean_below_lod, y = mean_qc_warning)) +
  ggrepel::geom_text_repel(aes(x = mean_below_lod, y = mean_qc_warning, 
                      label = ifelse(mean_below_lod > 0.2, protein_id, '')),
                  max.overlaps = Inf, color="grey") + # This ensures all labels are shown
  ggdark::dark_mode()

p2 <- exp_data$plasma_cytokine_concentration_by_olink %>%
  dplyr::mutate(below_lod = concentration < lower_limit_of_quantitation) %>%
  dplyr::mutate(qc_warning = quality_control == "Warning") %>%
  dplyr::group_by(protein_id) %>%
  dplyr::summarize(mean_below_lod = mean(below_lod, na.rm=TRUE),
                   mean_qc_warning = mean(qc_warning, na.rm=TRUE)) %>%
  dplyr::left_join(cytokine_uniprot_mapping, by="protein_id") %>%
  ggplot() +
  geom_point(aes(x = mean_below_lod, y = mean_qc_warning)) +
  ggrepel::geom_text_repel(aes(x = mean_below_lod, y = mean_qc_warning, 
                      label = ifelse(mean_below_lod > 0.2, cytokine, '')),
                  max.overlaps = Inf, color="grey") + # This ensures all labels are shown
  ggdark::dark_mode()

cowplot::plot_grid(p1, p2, ncol=2)

8.3 What is the fraction of unspecific antibody measurements

8.3.1 Per Isotype-Antigen

Code

exp_data$plasma_ab_titer %>%
  dplyr::group_by(isotype_antigen) %>%
  dplyr::summarize(is_antigen_specific_fraction = mean(is_antigen_specific)) %>%
  ggplot(aes(x=is_antigen_specific_fraction)) +
  geom_histogram(bins=30, color = "white", fill = "blue", alpha=0.1) +
  stat_bin(aes(label = after_stat(count)), bins=30, geom = "text", vjust = -0.5, color = "white", size = 3) +  
  ggdark::dark_mode(verbose=FALSE)

8.3.2 Per Specimen

So in some specimen we have non-specific antibody measurements? What does that mean?

Code

exp_data$plasma_ab_titer %>%
  dplyr::group_by(specimen_id) %>%
  dplyr::summarize(is_antigen_specific_fraction = mean(is_antigen_specific)) %>%
  ggplot(aes(x=is_antigen_specific_fraction)) +
  geom_histogram(binwidth=1, color = "white", fill = "blue", alpha=0.1) +
  stat_bin(aes(label = after_stat(count)), binwidth=1, geom = "text", vjust = -0.5, color = "white", size = 3) +  
  ggdark::dark_mode(verbose=FALSE)

9 Hierarchy in PBMC Frequency Data

9.1 NK

There are 3 types of NK cells:

NK cells (CD3-CD19-CD56+): CD19-CD3-CD56+/++
NK: CD19-CD3-CD56+
CD56high NK cells: CD19-CD3-CD56++

And I don’t know what to do with that information!

9.2 No Gating Info for 2023

I just assume it is the same as in 2022, but this might not be true since the data look so different

9.3 No Gating Info for Basophils and CD19 in 2020

I just assume it is the same as in 2021 and 2022

9.4 No Gating Info for non-pDCs at all

Not sure how to fix this)

Code

exp_data$pbmc_cell_frequency %>%
  dplyr::left_join((meta_data %>%
                      dplyr::select(specimen_id, dataset) %>%
                      dplyr::distinct()),
                   by="specimen_id") %>%
  dplyr::left_join((celltype_meta %>%
                      dplyr::select(cell_type_name, dataset, level) %>%
                      dplyr::distinct()),
                    by=c("dataset", "cell_type_name")) %>%
  dplyr::filter(is.na(level)) %>%
  dplyr::select(dataset, cell_type_name) %>%
  dplyr::distinct()

9.5 Should the Proportions sum up to 1 at level 0?

Trying to take the hierarchical information into account, it still does not make sense

Code

exp_data$pbmc_cell_frequency %>%
  dplyr::left_join((meta_data %>%
                      dplyr::select(specimen_id, dataset) %>%
                      dplyr::distinct()),
                   by="specimen_id") %>%
  dplyr::left_join((celltype_meta %>%
                      dplyr::select(cell_type_name, dataset, level) %>%
                      dplyr::distinct()),
                    by=c("dataset", "cell_type_name")) %>%
  dplyr::filter(level == 0) %>%
  dplyr::group_by(specimen_id) %>%
  dplyr::summarize(percent_live_cell_sum = sum(percent_live_cell, na.rm=TRUE)) %>%
  dplyr::left_join(meta_data, by="specimen_id") %>%
  ggplot() +
  geom_violin(aes(x=dataset, y=percent_live_cell_sum)) +
  ggdark::dark_mode(verbose=FALSE)

9.6 Difference in 2023 Dataset

It seems like the 2023 data are very very different, which makes me hesitant to use it for any kind of predictions on the challenge data. E.g. just looking at level 0 which should make sense I get the following

Code

# first check for NAs for the levels
exp_data$pbmc_cell_frequency %>%
  dplyr::left_join((meta_data %>%
                      dplyr::select(specimen_id, dataset) %>%
                      dplyr::distinct()),
                   by="specimen_id") %>%
  dplyr::left_join((celltype_meta %>%
                      dplyr::select(cell_type_name, dataset, level) %>%
                      dplyr::distinct()),
                    by=c("dataset", "cell_type_name")) %>%
  dplyr::filter(is.na(level))

Code

exp_data$pbmc_cell_frequency %>%
  dplyr::left_join((meta_data %>%
                      dplyr::select(specimen_id, dataset) %>%
                      dplyr::distinct()),
                   by="specimen_id") %>%
  dplyr::left_join((celltype_meta %>%
                      dplyr::select(cell_type_name, dataset, level) %>%
                      dplyr::distinct()),
                    by=c("dataset", "cell_type_name")) %>%
  dplyr::filter(level == 0) %>%
  dplyr::group_by(specimen_id) %>%
  dplyr::summarize(percent_live_cell_sum = sum(percent_live_cell, na.rm=TRUE)) %>%
  dplyr::left_join(meta_data, by="specimen_id") %>%
  ggplot() +
  geom_violin(aes(x=dataset, y=percent_live_cell_sum)) +
  ggdark::dark_mode(verbose=FALSE)

Especially, when completely ignoring the hierarchy in the gating information, I would expect to have more than 100% in percelt_live_cell_sum, but this is not true for the specimens from the 2023 dataset

Code

exp_data$pbmc_cell_frequency %>%
  dplyr::left_join((meta_data %>%
                      dplyr::select(specimen_id, dataset) %>%
                      dplyr::distinct()),
                   by="specimen_id") %>%
  dplyr::left_join((celltype_meta %>%
                      dplyr::select(cell_type_name, dataset, level) %>%
                      dplyr::distinct()),
                    by=c("dataset", "cell_type_name")) %>%
  dplyr::group_by(specimen_id) %>%
  dplyr::summarize(percent_live_cell_sum = sum(percent_live_cell, na.rm=TRUE)) %>%
  dplyr::left_join(meta_data, by="specimen_id") %>%
  ggplot() +
  geom_violin(aes(x=dataset, y=percent_live_cell_sum)) +
  ggdark::dark_mode(verbose=FALSE)

Or what I use a list of predefined cell types of interest?

10 Features for T cell Activation Assay (AIM)

Code

exp_data$t_cell_activation %>%
  dplyr::select(stimulation) %>%
  dplyr::distinct()

Code

exp_data$t_cell_activation %>%
  ggplot() +
  geom_histogram(aes(x=analyte_percentages), bins=100) +
  facet_wrap(~stimulation) +
  ggdark::dark_mode()

11 Features for T cell Polarization Assay (FluoroSpot)

Code

exp_data$t_cell_polarization %>%
  dplyr::left_join(protein_meta, by=c("protein_id"="uniprot_id")) %>%
  dplyr::select(protein_id, cytokine) %>%
  dplyr::distinct()

Code

exp_data$t_cell_polarization %>%
  dplyr::select(stimulation) %>%
  dplyr::distinct()

where

DMSO: “Negative” Control (solvent where peptides are dissolved in)
PHA (Phytohaemagglutinin): “Postive” Control (this should elicit strong T cell response?)
PT: Bordetella Pertussis Toxin

Code

exp_data$t_cell_polarization %>%
  dplyr::left_join(protein_meta, by=c("protein_id"="uniprot_id")) %>%
  ggplot() +
  geom_histogram(aes(x=analyte_counts), bins=100) +
  facet_grid(rows=vars(stimulation), cols=vars(cytokine)) +
  scale_x_log10() +
  ggdark::dark_mode()

Code

# TODO: the assay has been normalized relative to DMSO
exp_data$t_cell_polarization %>%
  dplyr::filter(stimulation=="DMSO") %>%
  dplyr::pull(analyte_counts) %>%
  dplyr::near(., 1) %>%
  mean()

[1] NaN

Code

exp_data$t_cell_polarization %>%
  dplyr::filter(stimulation!="DMSO") %>%
  dplyr::pull(stimulation_protein_id) %>%
  unique()

[1] "PHA_P01579" "PHA_Q16552" "PHA_P05113" "PT_P01579"  "PT_Q16552" 
[6] "PT_P05113"

12 Other Notes

Only from 2022 onwards do we have time-course baseline measurements, i.e. -30, -14/15, 0

Code

meta_data %>%
  dplyr::count(partition, dataset, planned_day_relative_to_boost) %>%
  dplyr::filter(planned_day_relative_to_boost < 0)

13 Appendix

Code

sessionInfo()

R version 4.2.3 (2023-03-15)
Platform: aarch64-apple-darwin20 (64-bit)
Running under: macOS 14.6.1

Matrix products: default
BLAS:   /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/lib/libRlapack.dylib

locale:
[1] C

attached base packages:
[1] grid      stats     graphics  grDevices datasets  utils     methods  
[8] base     

other attached packages:
 [1] circlize_0.4.16       ComplexHeatmap_2.15.4 magick_2.8.3         
 [4] FactoMineR_2.11       factoextra_1.0.7      ggdark_0.2.1         
 [7] lubridate_1.9.3       forcats_1.0.0         stringr_1.5.1        
[10] dplyr_1.1.4           purrr_1.0.2           readr_2.1.5          
[13] tidyr_1.3.1           tibble_3.2.1          ggplot2_3.5.1        
[16] tidyverse_2.0.0      

loaded via a namespace (and not attached):
 [1] bit64_4.5.2          vroom_1.6.5          jsonlite_1.8.9      
 [4] foreach_1.5.2        BiocManager_1.30.25  stats4_4.2.3        
 [7] renv_1.0.11          yaml_2.3.10          ggrepel_0.9.6       
[10] pillar_1.9.0         lattice_0.22-6       glue_1.8.0          
[13] digest_0.6.37        RColorBrewer_1.1-3   colorspace_2.1-1    
[16] cowplot_1.1.3        htmltools_0.5.8.1    pkgconfig_2.0.3     
[19] GetoptLong_1.0.5     xtable_1.8-4         mvtnorm_1.3-1       
[22] scales_1.3.0         tzdb_0.4.0           timechange_0.3.0    
[25] emmeans_1.10.5       farver_2.1.2         generics_0.1.3      
[28] IRanges_2.32.0       DT_0.33              withr_3.0.2         
[31] BiocGenerics_0.44.0  cli_3.6.3            magrittr_2.0.3      
[34] crayon_1.5.3         estimability_1.5.1   evaluate_1.0.1      
[37] fansi_1.0.6          doParallel_1.0.17    MASS_7.3-60.0.1     
[40] tools_4.2.3          hms_1.1.3            GlobalOptions_0.1.2 
[43] lifecycle_1.0.4      matrixStats_1.4.1    S4Vectors_0.36.2    
[46] munsell_0.5.1        cluster_2.1.6        flashClust_1.01-2   
[49] compiler_4.2.3       multcompView_0.1-10  rlang_1.1.4         
[52] iterators_1.0.14     rjson_0.2.23         htmlwidgets_1.6.4   
[55] leaps_3.2            labeling_0.4.3       rmarkdown_2.29      
[58] gtable_0.3.6         codetools_0.2-20     R6_2.5.1            
[61] knitr_1.49           bit_4.5.0            fastmap_1.2.0       
[64] utf8_1.2.4           clue_0.3-65          shape_1.4.6.1       
[67] stringi_1.8.4        parallel_4.2.3       Rcpp_1.0.13-1       
[70] vctrs_0.6.5          png_0.1-8            scatterplot3d_0.3-44
[73] tidyselect_1.2.1     xfun_0.49            coda_0.19-4.1