Cell counts matrix

# ==== COUNTS TABLE ============================================================

# read database of counts
count_long = readxl::read_excel("output/counts_Assignment_db.xlsx")
count_long$cluster = as.character(count_long$cluster)


# ==== SAMPLE TABLE ============================================================

# create sample database
sample_data = data.frame(file = unique(count_long$file))

# create a simplified sample id
sample_data$sample_id = sub(".fcs.astrolabe.fcs", "", sample_data$file, fixed = TRUE)
sample_data$sample_id = sub("_EHA001", "", sample_data$sample_id, fixed = TRUE)
sample_data$sample_id = sub("_1_Patients", "", sample_data$sample_id, fixed = TRUE)
# set it as key index of rows
rownames(sample_data) = sample_data$sample_id

# extract batch, patient id, condition from sample id
sample_var = Reduce(rbind.data.frame, strsplit(sample_data$sample_id, "_"))
colnames(sample_var) = c("batch", "patient", "condition")
# append these columns
sample_data = cbind(sample_data, sample_var)

# order samples
order_sample = with(sample_data, order(condition, batch, patient))


# ==== CLUSTER TABLE ===========================================================

# read cluster database
cluster_long = read.csv("input/attachments/Assignment.csv")
cluster_long$Value = as.character(cluster_long$Value)


# ==== MATRIX OF COUNTS ========================================================

# dimension of the matrix of counts
col_names = sample_data$file[order_sample]
row_names = cluster_long$Value

# put extracted counts into a matrix
counts = matrix(0, nrow = length(row_names), ncol = length(col_names),
                dimnames = list(row_names, col_names))
counts[cbind(count_long$cluster, count_long$file)] = count_long$counts

# assign comprehensive names
colnames(counts) = sample_data$sample[order_sample]
rownames(counts) = cluster_long$CellSubset


# ==== FILTER OUT LOW COUNTS ===================================================

# This an optional step that simplifies the Excel processing

# require that a minimal total count
keep_total = rowSums(counts) > 43 * 50
# require that at most 75% of counts are zero
keep_quart = apply(counts, 1, quantile, 0.75) > 0
kept_clusters = which(keep_total & keep_quart)

# update counts table
counts = counts[ kept_clusters, ]


# ==== VIEW, SAVE ==============================================================

knitr::kable(counts[1:9, 1:5])
160406_1758_Acute 160406_1760_Acute 160406_1773_Acute 160406_1785_Acute 160406_1790_Acute
B Cell (CD27-) 1166 9089 1626 1286 2551
B Cell (Memory) 187 685 296 608 813
B Cell (Plasmablast) 287 411 85 890 494
B Cell_unassigned 2184 8841 3530 2531 2930
Basophil 39 260 13 185 0
CD4+ CD8+ T Cell 0 4306 1324 7155 6379
CD4+ T Cell (Central Memory) 5153 6484 2680 2437 9530
CD4+ T Cell (Effector Memory) 872 320 369 1553 3427
CD4+ T Cell (EMRA) 128 174 80 447 222
counts_df = cbind.data.frame(clusters = rownames(counts), counts)
writexl::write_xlsx(
  counts_df, "output/counts_Assignment_matrix.xlsx")

save(
  counts, sample_data, order_sample, 
  file = "output/counts_Assignment.RData")

Median Fluorescence Intensity matrix

# ==== MFIS TABLE ==============================================================

# read database of mfis
mfi_long = readxl::read_excel("output/mfi_Assignment_median_db.xlsx")
mfi_long$cluster = as.character(mfi_long$cluster)


# ==== MARKER TABLE ============================================================

# load markers table and retain selected markers
mrk_long = as.data.frame(
  readxl::read_excel("output/fcs_markers.xlsx"))
# filter out useless markers (which removes NA)
mrk_long = mrk_long[ which(mrk_long$useful == 1), c("name", "desc")]

# reduce MFI table to selected markers
tbl = merge(mfi_long, mrk_long, by.x = "marker", by.y = "name")

# reduce MFI table to selected clusters
tbl = merge(
  tbl, data.frame(cluster = cluster_long$Value[kept_clusters]))


# ==== MATRIX OF MFIS ==========================================================

# dimension of the matrix of mfis
col_names = mrk_long$desc
row_names = cluster_long$Value[kept_clusters]

# extract MFI DB
res = with(tbl, aggregate(
  tbl[, "median_mfi", drop = FALSE], list(marker = desc, cluster = cluster), median))
# weighted MFI version
# append counts
tbl = merge(tbl, as.data.frame(count_long[, c("cluster", "file", "counts")]))
# split
resw = split(tbl, ~ desc + cluster)
# compute for each data chunk
resw =  lapply(resw, function(df) 
  list(marker = df$desc[1], cluster = df$cluster[1],
       median_mfi = limma::weighted.median(df$median_mfi, df$counts)))
# assemble
resw = Reduce(rbind.data.frame, resw)

# df = merge(res, resw, by = c("marker", "cluster"))
# plot(asinh(df$median_mfi.x/5), asinh(df$median_mfi.y/5), asp = 1)
# library(ggplot2)
# ggplot(df, aes(asinh(median_mfi.x/5), asinh(median_mfi.y/5))) + geom_point() +
#   facet_wrap(~marker)
# ggplot(df, aes(asinh(median_mfi.x/5), asinh(median_mfi.y/5), col = marker)) + geom_point()

# put extracted mfis into a matrix
mfis = matrix(0, nrow = length(row_names), ncol = length(col_names),
              dimnames = list(row_names, col_names))
mfis[cbind(res$cluster, res$marker)] = resw$median_mfi

# assign comprehensive names to clusters
rownames(mfis) = cluster_long$CellSubset[kept_clusters]


# ==== VIEW, SAVE ==============================================================

knitr::kable(mfis[1:9, 1:5])
CD57 CD45 CD19 CD45RA CD141
B Cell (CD27-) 5.4535198 126.62884 107.1281357 130.730270 3.1227607
B Cell (Memory) 6.7174575 138.85619 120.0290985 164.852600 4.9270062
B Cell (Plasmablast) 2.8377441 78.54947 38.5903625 79.405418 0.8412668
B Cell_unassigned 5.0418935 116.94312 101.4108276 114.130939 2.6247699
Basophil 0.5703478 36.11363 0.0000000 2.944027 0.0000000
CD4+ CD8+ T Cell 5.9221554 148.14133 0.3653206 20.671368 1.1517029
CD4+ T Cell (Central Memory) 4.2089591 126.08840 0.0000000 5.122553 0.5479562
CD4+ T Cell (Effector Memory) 5.6014166 139.60620 0.0000000 2.720707 0.6345142
CD4+ T Cell (EMRA) 6.4672663 112.40231 0.5579739 20.046161 0.6310581
mfis_df = cbind.data.frame(clusters = rownames(mfis), mfis)
writexl::write_xlsx(
  mfis_df, "output/mfis_Assignment_matrix.xlsx")

save(mfis, sample_data, order_sample, file = "output/mfis_Assignment.RData")

Median Fluorescence Intensity matrix per sample

# ==== MATRIX OF MFIS PER SAMPLE ===============================================

# extract MFI DB
res = tbl
res$key = sprintf("%s $$ %s", res$desc, res$cluster)

# dimension of the matrix of mfis
col_names = sample_data$file[order_sample]
row_names_df = expand.grid(
  cluster = cluster_long$Value[kept_clusters], desc = mrk_long$desc)
row_names = row_names_df$key = 
  sprintf("%s $$ %s", row_names_df$desc, row_names_df$cluster)

# put extracted mfis into a matrix
mfis_samples = matrix(
  0, nrow = length(row_names), ncol = length(col_names),
  dimnames = list(row_names, col_names))
mfis_samples[cbind(res$key, res$file)] = res$median_mfi

row_names_df$key_desc = sprintf(
  "%s - %s", row_names_df$desc, cluster_long$CellSubset[row_names_df$cluster])

# assign comprehensive names
colnames(mfis_samples) = sample_data$sample[order_sample]
rownames(mfis_samples) = row_names_df$key_desc


# ==== VIEW, SAVE ==============================================================

knitr::kable(mfis_samples[1:9, 1:5])
160406_1758_Acute 160406_1760_Acute 160406_1773_Acute 160406_1785_Acute 160406_1790_Acute
CD57 - B Cell (CD27-) 2.493280 3.2060740 4.111000 2.464065 3.588324
CD57 - B Cell (Memory) 3.688186 4.6267438 5.930459 3.048475 3.798380
CD57 - B Cell (Plasmablast) 1.913485 1.8589548 1.783925 1.647928 1.613323
CD57 - B Cell_unassigned 1.893067 2.3921435 3.088092 1.898907 2.689122
CD57 - Basophil 0.000000 0.0621516 2.546956 0.000000 0.000000
CD57 - CD4+ CD8+ T Cell 0.000000 2.6537449 3.119385 1.494454 3.703859
CD57 - CD4+ T Cell (Central Memory) 1.411105 1.8586383 1.728350 1.115001 2.743009
CD57 - CD4+ T Cell (Effector Memory) 1.787999 1.2110798 3.414195 1.413289 3.924535
CD57 - CD4+ T Cell (EMRA) 1.725193 1.3881128 147.955566 1.970000 198.599983
mfis_samples_df = cbind.data.frame(
  "marker - cluster" = rownames(mfis_samples), mfis_samples)
writexl::write_xlsx(
  as.data.frame(mfis_samples_df), "output/mfis_samples_Assignment_matrix.xlsx")

save(mfis_samples_df, sample_data, order_sample, 
     file = "output/mfis_samples_Assignment.RData")

Script session

Session info
## R version 4.4.2 (2024-10-31 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 22631)
## 
## Matrix products: default
## 
## 
## locale:
## [1] LC_COLLATE=French_France.utf8  LC_CTYPE=French_France.utf8   
## [3] LC_MONETARY=French_France.utf8 LC_NUMERIC=C                  
## [5] LC_TIME=French_France.utf8    
## 
## time zone: Europe/Paris
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## loaded via a namespace (and not attached):
##  [1] vctrs_0.6.5       cli_3.6.3         knitr_1.49        rlang_1.1.4      
##  [5] xfun_0.50         jsonlite_1.8.9    glue_1.8.0        statmod_1.5.0    
##  [9] htmltools_0.5.8.1 sass_0.4.9        readxl_1.4.3      writexl_1.5.1    
## [13] rmarkdown_2.29    cellranger_1.1.0  evaluate_1.0.3    jquerylib_0.1.4  
## [17] tibble_3.2.1      fastmap_1.2.0     yaml_2.3.10       lifecycle_1.0.4  
## [21] compiler_4.4.2    pkgconfig_2.0.3   limma_3.62.2      rstudioapi_0.17.1
## [25] digest_0.6.37     R6_2.5.1          pillar_1.10.1     magrittr_2.0.3   
## [29] bslib_0.8.0       tools_4.4.2       cachem_1.1.0