library(ithi.utils)
load_base_libs()
library(GenomicRanges)
library(nanotools)
library(ithi.meta)
library(ithi.ihc)
library(ithi.xcr)
library(ithi.expr)
pal_patient <- select_palette("patient")
db_path <- snakemake@params$db
ihc_table_path <- snakemake@input$ihc_table
ihc_table_slide_path <- snakemake@input$ihc_table_slide
xcr_table_path <- snakemake@input$xcr_table
prevalence_option <- snakemake@params$prevalence_option
distance_method <- snakemake@params$xcr_distance_method
filter_tissue <- snakemake@params$xcr_filter_tissue == "filter"
tcr_diversity_file <- snakemake@input$tcr_diversity
bcr_diversity_file <- snakemake@input$bcr_diversity
nanostring_data_path <- snakemake@input$nanostring_data
nanostring_annotations_path <- snakemake@input$nanostring_annotations
normalize_option <- snakemake@params$normalize %in% c("true", "TRUE", "True")
til_variance_option <- snakemake@params$til_variance_option
db <- src_sqlite(db_path, create = FALSE)
samples <- collect(tbl(db, "samples"))
ihc_table <- fread(ihc_table_path)
ihc_table_slide <- fread(ihc_table_slide_path)
ihc_samples <- ihc_table$condensed_id
if (any(table(ihc_samples) != 1)) {
stop("Duplicate condensed_ids from the same analysis type (IHC).")
}
tiltypes <- c("T_CD8_density", "T_CD4_density", "T_CD20_density", "T_Plasma_density")
plot_ihc_barplot(ihc_table, tiltypes, labs = c("CD8+ T cell", "CD4+ T cell",
"CD20+ B cell", "Plasma cell"), Y_MAX = 1500, db_path = db_path)
tiltypes <- c("T_CD8_density", "T_CD4_density", "T_CD20_density", "T_Plasma_density")
ihc_data <- subset(ihc_table, select = c("condensed_id", "patient_id", tiltypes))
ihc_data$tissue <- extract_tissue(ihc_data$condensed_id, type = "general", shorten = TRUE)
ihc_data_melted <- melt(ihc_data, id.vars = c("condensed_id", "patient_id",
"tissue"), measure.vars = tiltypes, variable.name = "Type", value.name = "Density")
RENAME_MAP <- list(T_CD8_density = "CD8+", T_CD4_density = "CD4+", T_CD20_density = "CD20+",
T_Plasma_density = "Plasma")
ihc_data_melted$Type <- mapvalues(ihc_data_melted$Type, from = names(RENAME_MAP),
to = unname(unlist(RENAME_MAP)))
## Calculate p-values
# We have multiple levels and multiple random effect levels. Residuals are
# ~approx normal. Let's use a GLM. I don't know of a nonparametric
# alternative for >=3 Tx levels.
pvals <- setNames(ddply(ihc_data_melted, .(Type), function(x) {
df <- subset(as.data.frame(x), !is.na(Density))
model <- lme(Density ~ tissue, random = ~1 | patient_id, data = df, method = "REML")
result <- anova.lme(model, type = "sequential", adjustSigma = FALSE)
tissue_pval <- result$`p-value`[2]
eq <- substitute(italic(P) == p, list(p = format(tissue_pval, digits = 3)))
return(as.character(as.expression(eq)))
}), c("Type", "p.value"))
p <- ggplot(ihc_data_melted, aes(x = Type, y = Density)) + geom_boxplot(aes(fill = tissue)) +
theme_bw() + theme_Publication() + scale_fill_Publication() + facet_wrap(~Type,
scales = "free", ncol = 2) + theme(axis.text.x = element_text(size = 0),
legend.text = element_text(size = 6, family = "Helvetica"), axis.text.y = element_text(size = 6,
family = "Helvetica"), axis.title = element_text(size = 8, family = "Helvetica")) +
geom_text(data = pvals, aes(x = Inf, y = Inf, label = p.value), hjust = 1.1,
vjust = 1.5, size = 3, parse = TRUE) + scale_y_continuous(trans = log10_trans(),
breaks = c(1, 10, 100, 1000))
p
ihcmat <- subset(ihc_table, select = c("E_CD8_density", "E_CD4_density", "E_CD20_density",
"E_Plasma_density", "S_CD8_density", "S_CD4_density", "S_CD20_density",
"S_Plasma_density"))
colnames(ihcmat) <- c("E CD8+", "E CD4+", "E CD20+", "E Plasma", "S CD8+", "S CD4+",
"S CD20+", "S Plasma")
corres <- corr.test(ihcmat, adjust = "fdr")
corres.r <- corres$r
corres.p <- corres$p
hc <- hclust(dist(corres.r), method = "ward.D")
diag(corres.r) <- NA
diag(corres.p) <- NA
corres.labels <- format(signif(corres.p, 2), 2)
cols <- colorRampPalette(c("#95cbee", "#ffd73e", "#ce472e"))(100)
par(family = "helvetica")
pheatmap(corres.r[hc$order, hc$order], display_numbers = corres.labels, color = cols,
number_color = "white", cluster_rows = FALSE, cluster_cols = FALSE)
xcr_table <- read_clonotypes(xcr_table_path, duplicates = FALSE, db_path, verbose = 1)
Read 29.5% of 304822 rows
Read 68.9% of 304822 rows
Read 304822 rows and 18 (of 18) columns from 0.070 GB file in 00:00:05
xcr_samples <- unique(xcr_table$condensed_id)
tcr_segment_type <- "TRB"
bcr_segment_type <- "IGH"
id_type <- "condensed_id"
Number of unique VOAs: 95, number of unique condensed_ids: 95.
tcr_clonotypes <- subset(xcr_table, type == tcr_segment_type)
tcr_cross_table <- cross_tabulate(tcr_clonotypes, id_type = id_type)
if (prevalence_option != "freq") {
summary <- tcr_clonotypes %>% group_by(condensed_id) %>% summarise(nclones = length(cdr3nt))
dists <- compute_immune_distance_matrix(tcr_cross_table, method = "num_overlaps")
} else {
summary <- tcr_clonotypes %>% group_by(condensed_id) %>% summarise(nreads = sum(count))
dists <- compute_immune_distance_matrix(tcr_cross_table, method = "num_reads")
}
overlapmat <- setNames(reshape2::melt(dists), c("sample1", "sample2", "divshared"))
overlapmat <- merge(overlapmat, setNames(summary, c("sample1", "div1")))
overlapmat <- merge(overlapmat, setNames(summary, c("sample2", "div2")))
overlapmat <- subset(overlapmat, sample1 != sample2)
overlapmat$sample1 <- as.character(overlapmat$sample1)
overlapmat$sample2 <- as.character(overlapmat$sample2)
df <- data.frame(sample1 = overlapmat$sample1, sample2 = overlapmat$sample2,
shared = overlapmat$divshared)
circos_plot(df, "shared", id_type = id_type, db_path = db_path)
bcr_clonotypes <- subset(xcr_table, type == bcr_segment_type)
bcr_cross_table <- cross_tabulate(bcr_clonotypes, id_type = id_type)
if (prevalence_option != "freq") {
summary <- bcr_clonotypes %>% group_by(condensed_id) %>% summarise(nclones = length(cdr3nt))
dists <- compute_immune_distance_matrix(bcr_cross_table, method = "num_overlaps")
} else {
summary <- bcr_clonotypes %>% group_by(condensed_id) %>% summarise(nreads = sum(count))
dists <- compute_immune_distance_matrix(bcr_cross_table, method = "num_reads")
}
overlapmat <- setNames(reshape2::melt(dists), c("sample1", "sample2", "divshared"))
overlapmat <- merge(overlapmat, setNames(summary, c("sample1", "div1")))
overlapmat <- merge(overlapmat, setNames(summary, c("sample2", "div2")))
overlapmat <- subset(overlapmat, sample1 != sample2)
overlapmat$sample1 <- as.character(overlapmat$sample1)
overlapmat$sample2 <- as.character(overlapmat$sample2)
df <- data.frame(sample1 = overlapmat$sample1, sample2 = overlapmat$sample2,
shared = overlapmat$divshared)
circos_plot(df, "shared", id_type = id_type, db_path = db_path)
cross_tables <- list(tcr = tcr_cross_table, bcr = bcr_cross_table)
The distance metric being used is horn.
distance_matrices <- lapply(cross_tables, function(cross_table) {
distmat <- compute_immune_distance_matrix(cross_table, method = distance_method)
mat <- as.matrix(distmat)
return(mat)
})
sims_raw <- lapply(distance_matrices, function(mat) {
sims <- average_intrapatient_similarity(mat, filter_tissue = filter_tissue,
id_type = id_type, db_path = db_path, raw = TRUE)
sims <- subset(sims, select = -c(patient2))
colnames(sims) <- mapvalues(colnames(sims), from = c("patient1"), to = c("patient"))
return(sims)
})
sims_avg <- lapply(distance_matrices, function(mat) {
average_intrapatient_similarity(mat, filter_tissue = filter_tissue, id_type = id_type,
db_path = db_path, raw = FALSE)
})
ignore <- lapply(list("tcr", "bcr"), function(segment) {
dat <- sims_raw[[segment]]
p <- ggplot(dat, aes(x = factor(patient), y = 1 - dist)) + geom_boxplot(aes(fill = factor(patient))) +
theme_bw() + theme_Publication() + scale_fill_manual(values = pal_patient) +
xlab("Patient") + ylab("Pairwise repertoire similarity") + guides(fill = guide_legend(title = "Patient"))
print(p)
})
mean_til_densities <- subset(ihc_table, select = -c(voa, site_id, condensed_id)) %>%
group_by(patient_id) %>% summarise_all(function(x) mean(x, na.rm = TRUE))
sims_avg_tcr_til <- merge(sims_avg$tcr, mean_til_densities, by = "patient_id")
sims_avg_bcr_til <- merge(sims_avg$bcr, mean_til_densities, by = "patient_id")
sims_avg_tcr_til$patient_id <- factor(sims_avg_tcr_til$patient_id)
sims_avg_bcr_til$patient_id <- factor(sims_avg_bcr_til$patient_id)
sims_avg_tcr_til <- subset(sims_avg_tcr_til, ncomparisons >= 6)
sims_avg_bcr_til <- subset(sims_avg_bcr_til, ncomparisons >= 6)
tiltypenames <- colnames(ihc_table)[str_detect(colnames(ihc_table), "^(T).*_density$")]
sims_avg_melted <- lapply(list(tcr = sims_avg_tcr_til, bcr = sims_avg_bcr_til),
function(merged_sims) {
other_cols <- colnames(merged_sims)[!colnames(merged_sims) %in% tiltypenames]
melt(merged_sims, id.vars = other_cols, measure.vars = tiltypenames,
variable.name = "Type", value.name = "value")
})
pvals <- lapply(sims_avg_melted, function(merged_sims_melt) {
setNames(ddply(merged_sims_melt, .(Type), function(x) {
df <- subset(as.data.frame(x), !is.na(value))
pval <- with(df, cor.test(median_sim, value, method = "spearman")$p.value)
# eq <- substitute(italic(P)==p, list(p=format(pval, digits=3)))
# return(as.character(as.expression(eq)))
return(pval)
}), c("Type", "p.value"))
})
pvals <- lapply(pvals, function(x) {
x$p.value <- p.adjust(x$p.value, method = "fdr")
x$p.value <- unlist(sapply(x$p.value, function(pval) as.character(as.expression(substitute(italic(P) ==
p, list(p = format(pval, digits = 3)))))))
return(x)
})
print(pvals$tcr)
Type p.value
1 T_CD8_density italic(P) == "0.00704"
2 T_CD4_density italic(P) == "0.142"
3 T_CD20_density italic(P) == "0.275"
4 T_Plasma_density italic(P) == "0.449"
5 T_Nonplasma_density italic(P) == "0.142"
print(pvals$bcr)
Type p.value
1 T_CD8_density italic(P) == "0.383"
2 T_CD4_density italic(P) == "0.383"
3 T_CD20_density italic(P) == "0.383"
4 T_Plasma_density italic(P) == "0.383"
5 T_Nonplasma_density italic(P) == "0.383"
ggplot(sims_avg_tcr_til, aes(x = median_sim, y = T_CD8_density)) + geom_point(aes(colour = patient_id)) +
theme_bw() + theme_Publication() + scale_color_manual(values = pal_patient) +
annotate(geom = "text", label = subset(pvals$tcr, Type == "T_CD8_density")$p.value,
x = Inf, y = Inf, hjust = 1, vjust = 1, parse = TRUE) + scale_y_log10()
ggplot(sims_avg_tcr_til, aes(x = median_sim, y = T_CD4_density)) + geom_point(aes(colour = patient_id)) +
theme_bw() + theme_Publication() + scale_color_manual(values = pal_patient) +
annotate(geom = "text", label = subset(pvals$tcr, Type == "T_CD4_density")$p.value,
x = Inf, y = Inf, hjust = 1, vjust = 1, parse = TRUE) + scale_y_log10()
ggplot(sims_avg_bcr_til, aes(x = median_sim, y = T_CD20_density)) + geom_point(aes(colour = patient_id)) +
theme_bw() + theme_Publication() + scale_color_manual(values = pal_patient) +
annotate(geom = "text", label = subset(pvals$bcr, Type == "T_CD20_density")$p.value,
x = Inf, y = Inf, hjust = 1, vjust = 1, parse = TRUE) + scale_y_log10()
ggplot(sims_avg_bcr_til, aes(x = median_sim, y = T_Plasma_density)) + geom_point(aes(colour = patient_id)) +
theme_bw() + theme_Publication() + scale_color_manual(values = pal_patient) +
annotate(geom = "text", label = subset(pvals$bcr, Type == "T_Plasma_density")$p.value,
x = Inf, y = Inf, hjust = 1, vjust = 1, parse = TRUE) + scale_y_log10()
Perhaps a better way of doing this is to use generalized linear models?
summary(glm(median_sim ~ T_CD8_density + T_CD4_density + T_CD20_density + T_Plasma_density,
data = sims_avg_tcr_til))
Call:
glm(formula = median_sim ~ T_CD8_density + T_CD4_density + T_CD20_density +
T_Plasma_density, data = sims_avg_tcr_til)
Deviance Residuals:
Min 1Q Median 3Q Max
-0.17954 -0.12617 -0.09175 0.06449 0.41226
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.1766786 0.1120449 1.577 0.1408
T_CD8_density 0.0019203 0.0008616 2.229 0.0457 *
T_CD4_density -0.0013891 0.0025360 -0.548 0.5939
T_CD20_density 0.0044570 0.0077225 0.577 0.5745
T_Plasma_density -0.0026832 0.0032726 -0.820 0.4283
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for gaussian family taken to be 0.04466831)
Null deviance: 0.89980 on 16 degrees of freedom
Residual deviance: 0.53602 on 12 degrees of freedom
AIC: 1.4783
Number of Fisher Scoring iterations: 2
summary(glm(median_sim ~ T_CD8_density + T_CD4_density + T_CD20_density + T_Plasma_density,
data = sims_avg_bcr_til))
Call:
glm(formula = median_sim ~ T_CD8_density + T_CD4_density + T_CD20_density +
T_Plasma_density, data = sims_avg_bcr_til)
Deviance Residuals:
Min 1Q Median 3Q Max
-0.169546 -0.084801 0.002978 0.051342 0.262292
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.1645175 0.0662052 2.485 0.02870 *
T_CD8_density 0.0009342 0.0005091 1.835 0.09140 .
T_CD4_density -0.0001514 0.0014985 -0.101 0.92119
T_CD20_density 0.0074466 0.0045631 1.632 0.12864
T_Plasma_density -0.0060288 0.0019337 -3.118 0.00889 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for gaussian family taken to be 0.01559549)
Null deviance: 0.37101 on 16 degrees of freedom
Residual deviance: 0.18715 on 12 degrees of freedom
AIC: -16.41
Number of Fisher Scoring iterations: 2
richness_column <- "observedDiversity_mean"
shannon_column <- "shannonWienerIndex_mean"
inverse_simpson_column <- "inverseSimpsonIndex_mean"
diversity_files <- list(tcr = tcr_diversity_file, bcr = bcr_diversity_file)
diversity <- lapply(diversity_files, function(f) {
read_xcr_diversity_file(f, db_path)
})
plot_diversity <- function(col) {
ignore <- lapply(diversity, function(df) {
p <- ggplot(df, aes_string(x = "patient_id", y = col)) + geom_boxplot(aes(fill = patient_id)) +
theme_bw() + theme_Publication() + scale_fill_manual(values = pal_patient) +
xlab("Patient") + ylab("Diversity")
print(p)
})
}
plot_diversity(richness_column)
plot_diversity(shannon_column)
plot_diversity(inverse_simpson_column)
exprs <- fread(nanostring_data_path)
labels <- fread(nanostring_annotations_path)
exprmat <- as.data.frame(getExprs(exprs))
annomat <- getAnnos(exprs)
rownames(exprmat) <- annomat$Name
nanostring_samples <- map_id(colnames(exprmat), from = "voa", to = "condensed_id",
db_path = db_path)
if (any(table(nanostring_samples) != 1)) {
stop("Duplicate condensed_ids from the same analysis type (Nanostring).")
}
patient_strings <- as.character(map_id(colnames(exprmat), from = "voa", to = "patient_id",
db_path = db_path))
## There are no duplicate condensed_ids when mapping from FFPE voa's, so we
## can convert here for readability
colnames(exprmat) <- nanostring_samples
annotations <- merge(annomat, labels, by.x = "Name", by.y = "Gene")
patients <- data.frame(Patient = patient_strings)
rownames(patients) <- colnames(exprmat)
cell_types <- subset(annotations, cell_type != "")$cell_type
cell_type_genes <- subset(annotations, cell_type != "")$Name[order(cell_types)]
annorow <- as.data.frame(subset(annotations, select = c(cell_type)))
rownames(annorow) <- annotations$Name
colnames(annorow)[1] <- "Cell Type"
annocol <- patients
annocolors <- list(`Cell Type` = get_colour_palette(unique(cell_types)), Patient = select_palette("patient"))
mat <- clip_values(t(scale(t(exprmat[cell_type_genes, ]))), hi = 2, lo = -2)
gaps <- which(rev(!duplicated(rev(cell_types[order(cell_types)]))))
pheatmap(mat, annotation_row = annorow, annotation_col = annocol, annotation_colors = annocolors,
cluster_rows = FALSE, show_colnames = FALSE, clustering_method = "ward.D",
gaps_row = gaps)
celltype_matrix <- create_celltype_matrix(exprs, labels, db_path)
celltype_matrix_scaled <- clip_values(t(scale(t(celltype_matrix))), hi = 2,
lo = -2)
pheatmap(celltype_matrix_scaled, annotation_col = annocol, annotation_colors = annocolors,
cluster_rows = TRUE, show_colnames = FALSE, clustering_method = "ward.D")
celltype_df <- data.frame(celltype_matrix %>% t, check.names = FALSE) %>% rownames_to_column(var = "condensed_id")
celltype_df$patient_id <- map_id(celltype_df$condensed_id, from = "condensed_id",
to = "patient_id", db_path)
mean_celltype_expression <- subset(celltype_df, select = -c(condensed_id)) %>%
group_by(patient_id) %>% summarise_all(function(x) mean(x, na.rm = TRUE))
sims_avg_tcr_expr <- merge(sims_avg$tcr, mean_celltype_expression, by = "patient_id")
sims_avg_bcr_expr <- merge(sims_avg$bcr, mean_celltype_expression, by = "patient_id")
sims_avg_tcr_expr$patient_id <- factor(sims_avg_tcr_expr$patient_id)
sims_avg_bcr_expr$patient_id <- factor(sims_avg_bcr_expr$patient_id)
exprnames <- colnames(celltype_df)[!colnames(celltype_df) %in% c("condensed_id",
"patient_id")]
sims_avg_melted <- lapply(list(tcr = sims_avg_tcr_expr, bcr = sims_avg_bcr_expr),
function(merged_sims) {
other_cols <- colnames(merged_sims)[!colnames(merged_sims) %in% exprnames]
melt(merged_sims, id.vars = other_cols, measure.vars = exprnames, variable.name = "Type",
value.name = "value")
})
pvals <- lapply(sims_avg_melted, function(merged_sims_melt) {
setNames(ddply(merged_sims_melt, .(Type), function(x) {
df <- subset(as.data.frame(x), !is.na(value))
pval <- with(df, cor.test(mean_sim, value, method = "spearman")$p.value)
eq <- substitute(italic(P) == p, list(p = format(pval, digits = 3)))
return(as.character(as.expression(eq)))
}), c("Type", "p.value"))
})
print(pvals$tcr)
Type p.value
1 aDC italic(P) == "0.0138"
2 B-cell italic(P) == "0.177"
3 CD8 T-cell italic(P) == "0.00941"
4 Cytotoxic cell italic(P) == "0.000607"
5 DC italic(P) == "0.992"
6 Eosinophils italic(P) == "0.0111"
7 iDC italic(P) == "0.199"
8 Macrophages italic(P) == "0.211"
9 Mast cell italic(P) == "0.0598"
10 Neutrophils italic(P) == "0.368"
11 NK CD56bright cell italic(P) == "0.042"
12 NK CD56dim cell italic(P) == "0.00021"
13 NK cell italic(P) == "0.219"
14 pDC italic(P) == "0.807"
15 T helper cell italic(P) == "0.677"
16 T-cell italic(P) == "0.000655"
17 Tcm italic(P) == "0.00029"
18 Tem italic(P) == "0.538"
19 TFH italic(P) == "0.00151"
20 Tgd italic(P) == "0.617"
21 Th1 cell italic(P) == "0.0132"
22 Th17 cell italic(P) == "0.347"
23 Th2 cell italic(P) == "0.967"
24 Treg italic(P) == "0.368"
print(pvals$bcr)
Type p.value
1 aDC italic(P) == "0.202"
2 B-cell italic(P) == "0.0442"
3 CD8 T-cell italic(P) == "0.237"
4 Cytotoxic cell italic(P) == "0.199"
5 DC italic(P) == "0.46"
6 Eosinophils italic(P) == "0.695"
7 iDC italic(P) == "0.183"
8 Macrophages italic(P) == "0.649"
9 Mast cell italic(P) == "0.0712"
10 Neutrophils italic(P) == "0.0197"
11 NK CD56bright cell italic(P) == "0.916"
12 NK CD56dim cell italic(P) == "0.129"
13 NK cell italic(P) == "0.846"
14 pDC italic(P) == "0.382"
15 T helper cell italic(P) == "0.972"
16 T-cell italic(P) == "0.237"
17 Tcm italic(P) == "0.129"
18 Tem italic(P) == "0.767"
19 TFH italic(P) == "0.239"
20 Tgd italic(P) == "0.337"
21 Th1 cell italic(P) == "0.429"
22 Th17 cell italic(P) == "0.0524"
23 Th2 cell italic(P) == "0.0757"
24 Treg italic(P) == "0.382"
ggplot(sims_avg_tcr_expr, aes(x = mean_sim, y = `CD8 T-cell`)) + geom_point(aes(colour = patient_id)) +
theme_bw() + theme_Publication() + scale_color_manual(values = pal_patient) +
annotate(geom = "text", label = subset(pvals$tcr, Type == "CD8 T-cell")$p.value,
x = Inf, y = Inf, hjust = 1, vjust = 1, parse = TRUE)
ggplot(sims_avg_tcr_expr, aes(x = mean_sim, y = `T helper cell`)) + geom_point(aes(colour = patient_id)) +
theme_bw() + theme_Publication() + scale_color_manual(values = pal_patient) +
annotate(geom = "text", label = subset(pvals$tcr, Type == "T helper cell")$p.value,
x = Inf, y = Inf, hjust = 1, vjust = 1, parse = TRUE)
ggplot(sims_avg_tcr_expr, aes(x = mean_sim, y = `T-cell`)) + geom_point(aes(colour = patient_id)) +
theme_bw() + theme_Publication() + scale_color_manual(values = pal_patient) +
annotate(geom = "text", label = subset(pvals$tcr, Type == "T-cell")$p.value,
x = Inf, y = Inf, hjust = 1, vjust = 1, parse = TRUE)
ggplot(sims_avg_tcr_expr, aes(x = mean_sim, y = `Th1 cell`)) + geom_point(aes(colour = patient_id)) +
theme_bw() + theme_Publication() + scale_color_manual(values = pal_patient) +
annotate(geom = "text", label = subset(pvals$tcr, Type == "Th1 cell")$p.value,
x = Inf, y = Inf, hjust = 1, vjust = 1, parse = TRUE)
ggplot(sims_avg_tcr_expr, aes(x = mean_sim, y = `Th2 cell`)) + geom_point(aes(colour = patient_id)) +
theme_bw() + theme_Publication() + scale_color_manual(values = pal_patient) +
annotate(geom = "text", label = subset(pvals$tcr, Type == "Th2 cell")$p.value,
x = Inf, y = Inf, hjust = 1, vjust = 1, parse = TRUE)
ggplot(sims_avg_bcr_expr, aes(x = mean_sim, y = `B-cell`)) + geom_point(aes(colour = patient_id)) +
theme_bw() + theme_Publication() + scale_color_manual(values = pal_patient) +
annotate(geom = "text", label = subset(pvals$bcr, Type == "B-cell")$p.value,
x = Inf, y = Inf, hjust = 1, vjust = 1, parse = TRUE)
Requires that condensed_ids are never duplicated in any analysis type. I’ve inserted checks into each of the sections above to ensure this.
We’ll work with cell-type metagenes as working with all expression values will be biased by the number of genes associated with each cell type.
tiltypes <- c("T_CD8_density", "T_CD4_density", "T_CD20_density", "T_Plasma_density")
ihc_slide_data <- subset(ihc_table_slide, select = c("condensed_id", "patient_id",
"slide", tiltypes))
ihc_melted <- melt(ihc_slide_data, id.vars = c("condensed_id", "patient_id",
"slide"), measure.vars = tiltypes, variable.name = "tiltype", value.name = "density")
ihc_melted$tiltype <- as.character(ihc_melted$tiltype)
T_CELL_TYPES <- c("T_CD8_density", "T_CD4_density")
B_CELL_TYPES <- c("T_CD20_density", "T_Plasma_density")
NANOSTRING_T_TYPES <- c("CD8 T-cell", "Cytotoxic cell", "T-cell", "T helper cell",
"Th1 cell", "Th2 cell", "Th17 cell", "Treg", "Tcm", "Tem")
NANOSTRING_B_TYPES <- c("B-cell", "TFH")
XCR_T_TYPES <- c("tcr")
XCR_B_TYPES <- c("bcr")
cols <- colorRampPalette(c("#95cbee", "#ffd73e", "#ce472e"))(100)
corplot <- function(var1, var2, sub1, sub2) {
cor_df <- Reduce(function(x, y) merge(x, y, by = "patient_id"), list(var1,
var2))
cor_matrix <- subset(cor_df, select = -c(patient_id))
corres <- corr.test(cor_matrix, method = "pearson", adjust = "none")
# hc <- hclust(dist(corres$r), method='ward.D')
diag(corres$r) <- NA
diag(corres$p) <- NA
corres.r <- corres$r[sub1, sub2]
corres.p <- corres$p[sub1, sub2]
corres.p.labels <- format(signif(corres.p, 2), 2)
pheatmap(corres.r, display_numbers = corres.p.labels, color = cols, number_color = "white",
cluster_rows = TRUE, cluster_cols = TRUE)
}
intersect_samples <- compute_sample_intersection(samples_list = list(ihc_samples,
xcr_samples))
til_var <- compute_ihc_var(ihc_melted, ihc_table, samples = intersect_samples,
til_variance_option = "stabilize", tiltypes)
xcr_var <- compute_xcr_var(distance_matrices, samples = intersect_samples, filter_tissue = FALSE,
id_type = "condensed_id", db_path)
ihc_entries <- c(T_CELL_TYPES, B_CELL_TYPES)
xcr_entries <- c(XCR_T_TYPES, XCR_B_TYPES)
corplot(til_var, xcr_var, ihc_entries, xcr_entries)
intersect_samples <- compute_sample_intersection(samples_list = list(ihc_samples,
nanostring_samples))
til_var <- compute_ihc_var(ihc_melted, ihc_table, samples = intersect_samples,
til_variance_option = "stabilize", tiltypes)
expr_var <- compute_expr_var(celltype_matrix, samples = intersect_samples, scale = TRUE)
ihc_entries <- c(T_CELL_TYPES, B_CELL_TYPES)
expr_entries <- c(NANOSTRING_T_TYPES, NANOSTRING_B_TYPES)
corplot(til_var, expr_var, ihc_entries, expr_entries)
intersect_samples <- compute_sample_intersection(samples_list = list(xcr_samples,
nanostring_samples))
xcr_var <- compute_xcr_var(distance_matrices, samples = intersect_samples, filter_tissue = FALSE,
id_type = "condensed_id", db_path)
expr_var <- compute_expr_var(celltype_matrix, samples = intersect_samples, scale = TRUE)
xcr_entries <- c(XCR_T_TYPES, XCR_B_TYPES)
expr_entries <- c(NANOSTRING_T_TYPES, NANOSTRING_B_TYPES)
corplot(xcr_var, expr_var, xcr_entries, expr_entries)
We can also look at consistency in intrapatient similarity on a pairwise basis.
scalar_dist_method <- "manhattan"
ihc_stabilized <- stabilize_ihc_variances(ihc_slide_data, ihc_table, tiltypes)
ihc_stabilized$condensed_id <- factor_id(ihc_stabilized$condensed_id, "condensed_id",
db_path)
ihc_sims <- pairwise_similarity_frame(ihc_stabilized, id_col = id_type, group = "patient_id",
method = scalar_dist_method)
## always filter_tissue = FALSE
xcr_sims_raw_all <- lapply(names(distance_matrices), function(segment) {
mat <- distance_matrices[[segment]]
sims <- average_intrapatient_similarity(mat, filter_tissue = FALSE, id_type = id_type,
db_path = db_path, raw = TRUE)
sims <- subset(sims, select = -c(patient2))
colnames(sims) <- mapvalues(colnames(sims), from = c("patient1", "dist"),
to = c("patient_id", segment))
return(sims)
})
names(xcr_sims_raw_all) <- names(distance_matrices)
celltype_df <- rownames_to_column(as.data.frame(t(celltype_matrix)), var = id_type)
celltype_df$patient_id <- map_id(celltype_df$condensed_id, from = "condensed_id",
to = "patient_id", db_path)
celltype_sims <- pairwise_similarity_frame(celltype_df, id_col = id_type, group = "patient_id",
method = scalar_dist_method)
ihc_xcr_sims <- Reduce(plyr::join, c(xcr_sims_raw_all, list(ihc_sims)))
ihc_entries <- c(T_CELL_TYPES, B_CELL_TYPES)
xcr_entries <- c(XCR_T_TYPES, XCR_B_TYPES)
cor_matrix <- subset(ihc_xcr_sims, select = c(ihc_entries, xcr_entries))
corres <- corr.test(cor_matrix, method = "spearman", adjust = "none")
# hc <- hclust(dist(corres$r), method='ward.D')
diag(corres$r) <- NA
diag(corres$p) <- NA
corres.r <- corres$r[ihc_entries, xcr_entries]
corres.p <- corres$p[ihc_entries, xcr_entries]
corres.p.labels <- format(signif(corres.p, 2), 2)
pheatmap(corres.r, display_numbers = corres.p.labels, color = cols, number_color = "white",
cluster_rows = TRUE, cluster_cols = TRUE)
xcr_sims_melted <- Reduce(plyr::join, xcr_sims_raw_all)
xcr_sims_melted <- melt(xcr_sims_melted, id.vars = colnames(xcr_sims_melted)[!colnames(xcr_sims_melted) %in%
xcr_entries], measure.vars = xcr_entries, variable.name = "segment", value.name = "xcrsim")
ihc_sims_melted <- melt(ihc_sims, id.vars = colnames(ihc_sims)[!colnames(ihc_sims) %in%
ihc_entries], measure.vars = ihc_entries, variable.name = "tiltype", value.name = "ihcsim")
ihc_xcr_melted <- merge(ihc_sims_melted, xcr_sims_melted, by = c("sample1",
"sample2", "patient_id"))
ggplot(ihc_xcr_melted, aes(x = ihcsim, y = xcrsim)) + geom_point(aes(colour = factor(patient_id))) +
facet_wrap(tiltype ~ segment, scales = "free") + theme_bw() + theme_Publication() +
scale_colour_manual(values = pal_patient)
summary(lmer(xcrsim ~ ihcsim + (1 | patient_id), data = subset(ihc_xcr_melted,
tiltype == "T_CD8_density" & segment == "tcr" & !is.na(ihcsim) & !is.na(xcrsim))))
Linear mixed model fit by REML t-tests use Satterthwaite approximations
to degrees of freedom [lmerMod]
Formula: xcrsim ~ ihcsim + (1 | patient_id)
Data: subset(ihc_xcr_melted, tiltype == "T_CD8_density" & segment ==
"tcr" & !is.na(ihcsim) & !is.na(xcrsim))
REML criterion at convergence: -87.7
Scaled residuals:
Min 1Q Median 3Q Max
-4.4652 -0.4055 0.0996 0.4997 1.5398
Random effects:
Groups Name Variance Std.Dev.
patient_id (Intercept) 0.05215 0.2284
Residual 0.01781 0.1334
Number of obs: 121, groups: patient_id, 17
Fixed effects:
Estimate Std. Error df t value Pr(>|t|)
(Intercept) 6.911e-01 5.951e-02 1.853e+01 11.613 6.07e-10 ***
ihcsim 7.561e-03 4.369e-02 1.116e+02 0.173 0.863
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Correlation of Fixed Effects:
(Intr)
ihcsim -0.271
Critically, TCR similarity does not correlate with CD8+ or CD4+ T-cell density similarity.
ihc_expr_sims <- Reduce(plyr::join, list(ihc_sims, celltype_sims))
ihc_entries <- c(T_CELL_TYPES, B_CELL_TYPES)
expr_entries <- c(NANOSTRING_T_TYPES, NANOSTRING_B_TYPES)
cor_matrix <- subset(ihc_expr_sims, select = c(ihc_entries, expr_entries))
corres <- corr.test(cor_matrix, method = "spearman", adjust = "none")
# hc <- hclust(dist(corres$r), method='ward.D')
diag(corres$r) <- NA
diag(corres$p) <- NA
corres.r <- corres$r[ihc_entries, expr_entries]
corres.p <- corres$p[ihc_entries, expr_entries]
corres.p.labels <- format(signif(corres.p, 2), 2)
pheatmap(corres.r, display_numbers = corres.p.labels, color = cols, number_color = "white",
cluster_rows = TRUE, cluster_cols = TRUE)
xcr_expr_sims <- Reduce(plyr::join, c(xcr_sims_raw_all, list(celltype_sims)))
xcr_entries <- c(XCR_T_TYPES, XCR_B_TYPES)
expr_entries <- c(NANOSTRING_T_TYPES, NANOSTRING_B_TYPES)
cor_matrix <- subset(xcr_expr_sims, select = c(xcr_entries, expr_entries))
corres <- corr.test(cor_matrix, method = "spearman", adjust = "none")
# hc <- hclust(dist(corres$r), method='ward.D')
diag(corres$r) <- NA
diag(corres$p) <- NA
corres.r <- corres$r[xcr_entries, expr_entries]
corres.p <- corres$p[xcr_entries, expr_entries]
corres.p.labels <- format(signif(corres.p, 2), 2)
pheatmap(corres.r, display_numbers = corres.p.labels, color = cols, number_color = "white",
cluster_rows = TRUE, cluster_cols = TRUE)
These results aren’t really consistent with the patient-level results. It’s important to note, however, that these correlations are calculated naive to the patient labels. i.e. there’s no controlling for patient with a random intercept and slope, etc.
Things become more difficult to interpret if we incorporate random intercepts and slopes. There could be high correlations within patients but low correlations between patients, and vice versa.
Think about how we can show this in a statistically robust way.
pairwise_results_filtered <- ihc_pairwise_significance(ihc_table_slide, tiltypes,
db_path, slide_count_filter = 10)
Q_VALUE_THRESHOLD <- 0.01
pairwise_results_filtered$significant <- pairwise_results_filtered$q.value <
Q_VALUE_THRESHOLD
pairwise_tiltype_summary <- pairwise_results_filtered %>% group_by(patient_id,
tiltype) %>% summarise(any_significant = any(significant))
pairwise_tiltype_count <- pairwise_tiltype_summary %>% group_by(tiltype) %>%
summarise(npatients = sum(any_significant, na.rm = TRUE))
pairwise_tiltype_count
x <- pairwise_tiltype_summary %>% group_by(tiltype) %>% do(sigvec = .$any_significant)
x_pairs <- as_tibble(merge(x, x, by = c()))
x_pairs_p <- x_pairs %>% group_by(tiltype.x, tiltype.y) %>% summarise(p.value = pchisq(GenomicRanges::phicoef(table(unlist(sigvec.x),
unlist(sigvec.y)))^2 * sum(table(unlist(sigvec.x), unlist(sigvec.y))), df = 1,
lower.tail = FALSE))
x_pairs_p$q.value <- p.adjust(x_pairs_p$p.value, method = "fdr")
x_pairs_p