Metadata
db <- src_sqlite(db_path, create = FALSE)
samples <- collect(tbl(db, "samples"))
read_mutsig_output <- function(dir, sample_table, external = FALSE) {
prop_table_path <- Sys.glob(file.path(dir, "*_props.tsv"))
sigs_path <- Sys.glob(file.path(dir, "../plots/*_multipanel.pdf"))
prop_table <- fread(prop_table_path)
sample_cols <- colnames(prop_table)[colnames(prop_table) != "signature"]
ith_cols <- str_detect(sample_cols, "patient")
patients <- str_extract(sample_cols[ith_cols], "(?<=patient_?)[0-9]+")
site_ids <- str_replace_all(sample_cols[ith_cols], "patient_?[0-9]+_", "")
FROM <- c("rectum_site_1", "rectum_site_2", "rectum_site_3", "rectum_site_4",
"pelvis_site_1", "pelvis_site_2", "cul_de_sac_site_1")
TO <- c("rectum_site_1", "rectum_site_1", "rectum_site_2", "rectum_site_2",
"pelvis_site_1", "pelvis_site_1", "cul-de-sac_site_1")
site_ids_fixed <- mapvalues(site_ids, from = FROM, to = TO)
df <- data.frame(patient_id = patients, site_id = site_ids_fixed, old_id = sample_cols[ith_cols])
if (length(df[with(df, patient_id == "11" & site_id == "left_ovary_site_2"),
]$site_id) > 0) {
df[with(df, patient_id == "11" & site_id == "left_ovary_site_2"), ]$site_id <- "left_ovary_site_3"
}
sample_subset <- unique(subset(sample_table, select = c("condensed_id",
"patient_id", "site_id")))
df <- plyr::join(df, sample_subset)
if (nrow(df) > 0) {
df$is_ancestral <- 0
df$is_ancestral[df$site_id == "ancestral"] <- 1
}
# colnames(prop_table)[ith_cols] <- make.unique(df$condensed_id)
prop_final <- melt(prop_table, id.vars = c("signature"), measure.vars = sample_cols,
variable.name = "old_id", value.name = "proportion")
if (external) {
prop_final_merged <- plyr::join(df, prop_final, by = c("old_id"), type = "full")
} else {
prop_final_merged <- plyr::join(df, prop_final, by = c("old_id"), type = "inner")
}
prop_final_merged <- prop_final_merged %>% mutate(new_id = ifelse(site_id %in%
c("ancestral", "residual"), yes = paste(patient_id, site_id, sep = "_"),
no = condensed_id))
na_idx <- is.na(prop_final_merged$new_id)
prop_final_merged$new_id[na_idx] <- as.character(prop_final_merged$old_id[na_idx])
result <- list(prop = prop_final_merged, sigplot = sigs_path)
return(result)
}
sum_total_variant_counts <- function(variant_sample) {
variant_sample %>% group_by(condensed_id, patient_id) %>% summarise(snv_count = sum(snv_count),
bkpt_count = sum(bkpt_count))
}
compute_mutsig_counts <- function(df) {
df <- df %>% mutate(is_sv = as.numeric(str_detect(signature, "^SV")), sigcount = ifelse(as.character(site_id) ==
"ancestral", yes = ifelse(is_sv, yes = ancestral_bkpt * proportion,
no = ancestral_snv * proportion), no = ifelse(is_sv, yes = bkpt_count *
proportion, no = snv_count * proportion)))
return(df)
}
create_mutsig_matrix <- function(df, col = "proportion") {
mat <- acast(df, new_id ~ signature, fun.aggregate = mean, value.var = col)
return(mat)
}
variant_res <- summarize_variant_counts(master_variant_file, master_breakpoint_file,
db_path)
Read 5.9% of 682516 rows
Read 682516 rows and 9 (of 9) columns from 0.026 GB file in 00:00:03
variant_sample <- variant_res$sample
variant_patient <- variant_res$patient
variant_sample_sum <- sum_total_variant_counts(variant_sample)
ihc_table <- fread(ihc_table_path)
ihc_table_subset <- subset(ihc_table, select = c("condensed_id", tiltypes))
ihc_table_slide <- fread(ihc_table_slide_path)
Sample level
Results
props_sample <- read_mutsig_output(mmctm_sample_result_dir, samples)
props_sample_prop <- subset(props_sample$prop, patient_id != "11")
props_sample_mat <- create_mutsig_matrix(props_sample_prop, col = "proportion")
props_sample_mat_scaled <- clip_values(scale(props_sample_mat), 2, -2)
sample_heat <- pheatmap(props_sample_mat_scaled, fontsize_row = 5, clustering_distance_rows = "correlation",
clustering_method = "ward.D2")
props_sample_merged <- Reduce(function(x, y) plyr::join(x, y), list(props_sample_prop,
variant_sample_sum, variant_patient, ihc_table_subset))
props_sample_merged <- compute_mutsig_counts(props_sample_merged)
COL <- "E_CD8_density"
measure <- "sigcount"
pvals <- setNames(ddply(props_sample_merged, .(signature), function(x) {
df <- as.data.frame(x)
df <- df[!is.na(df[, COL]), ]
corres <- summary(lmer(as.formula(paste0(measure, " ~ ", COL, " + (1|patient_id)")),
df))
pval <- unname(corres$coefficients[, 5][2])
eq <- substitute(italic(P) == p, list(p = format(pval, digits = 3)))
return(as.character(as.expression(eq)))
}), c("signature", "p.value"))
ggplot(props_sample_merged, aes_string(x = COL, y = measure)) + geom_point(aes(colour = patient_id)) +
scale_colour_manual(values = pal_patient) + facet_wrap(~signature, scales = "free") +
geom_text(data = pvals, aes(x = Inf, y = Inf, label = p.value), hjust = 1.1,
vjust = 1.5, size = 3, parse = TRUE) + theme_bw()
# x <- Reduce(function(x,y) merge(x,y, by=c('condensed_id')),
# list(rownames_to_column(as.data.frame(t(prop_matrix)), var =
# 'condensed_id'), ihc_table_subset)) x$patient_id <- map_id(x$condensed_id,
# from = 'condensed_id', to='patient_id', db_path) x <- subset(x, patient_id
# != '11') cols <- colnames(x)[!colnames(x) %in% c('condensed_id',
# 'patient_id')] rmat <- matrix(nrow=length(cols),ncol=length(cols)) pmat <-
# matrix(nrow=length(cols),ncol=length(cols)) for (i in 1:length(cols)) {
# for (j in i:length(cols)) { if (i != j) { col1 <- cols[i] col2 <- cols[j]
# formula <- paste0('`', col2, '`', '~', '`', col1, '`', '+ (1|patient_id)',
# sep=' ') res <- summary(lmer(formula, x)) pval <- tryCatch({
# unname(res$coefficients[,5][2]) }, error = function(e) { NA }) r <-
# unname(res$coefficients[,1][2]) rmat[i,j] <- rmat[j,i] <- r pmat[i,j] <-
# pmat[j,i] <- pval } else { rmat[i,j] <- 1 pmat[i,j] <- NA } } }
# resmat <- log10(pmat)*-sign(rmat) resmat[resmat == Inf] <- NA
# rownames(resmat) <- colnames(resmat) <- cols pheatmap(resmat,
# display_numbers = signif(pmat, 3), cluster_rows = FALSE, cluster_cols =
# FALSE, fontsize = 5)
# xmat <- x %>% select(-one_of('condensed_id', 'patient_id')) cormat <-
# corr.test(xmat, method='spearman', adjust='fdr') pheatmap(cormat$r,
# display_numbers = signif(cormat$p, 3), fontsize = 5)
Correlation matrices
ihc_mat <- as.data.frame(ihc_table_subset %>% dplyr::select(-condensed_id))
rownames(ihc_mat) <- ihc_table_subset$condensed_id
# TOTAL_TIL_COLS <- c('T_CD8_density', 'T_CD4_density', 'T_CD20_density',
# 'T_Plasma_density') ihc_mat <- subset(ihc_mat, select=TOTAL_TIL_COLS)
compute_ihc_mutsig_cor <- function(ihc_mat, mutsig_mat, patient_summary = FALSE,
ancestral = FALSE) {
if (patient_summary) {
ihc_mat <- rownames_to_column(as.data.frame(ihc_mat), "id")
mutsig_mat <- rownames_to_column(as.data.frame(mutsig_mat), "id")
if (ancestral) {
mutsig_mat <- subset(mutsig_mat, str_detect(id, "ancestral"))
} else {
mutsig_mat <- subset(mutsig_mat, !str_detect(id, "ancestral"))
}
ihc_mat$patient_id <- str_extract(ihc_mat$id, "^[0-9]+")
mutsig_mat$patient_id <- str_extract(mutsig_mat$id, "^[0-9]+")
ihc_mat <- ihc_mat %>% dplyr::select(-id) %>% group_by(patient_id) %>%
summarise_each(funs(mean(., na.rm = TRUE))) %>% column_to_rownames(var = "patient_id")
mutsig_mat <- mutsig_mat %>% dplyr::select(-id) %>% group_by(patient_id) %>%
summarise_each(funs(mean(., na.rm = TRUE))) %>% column_to_rownames(var = "patient_id")
ihc_mat <- as.matrix(ihc_mat)
mutsig_mat <- as.matrix(mutsig_mat)
}
intersect_samples <- intersect(rownames(mutsig_mat), rownames(ihc_mat))
mutsig <- mutsig_mat[intersect_samples, , drop = FALSE]
ihc <- ihc_mat[intersect_samples, , drop = FALSE]
pmat <- matrix(nrow = ncol(mutsig), ncol = ncol(ihc))
rmat <- matrix(nrow = ncol(mutsig), ncol = ncol(ihc))
rownames(pmat) <- rownames(rmat) <- colnames(mutsig)
colnames(pmat) <- colnames(rmat) <- colnames(ihc)
for (i in 1:ncol(mutsig)) {
for (j in 1:ncol(ihc)) {
corres <- cor.test(mutsig[, i], ihc[, j], method = "spearman")
pmat[i, j] <- corres$p.value
rmat[i, j] <- corres$estimate
}
}
return(list(p = pmat, r = rmat))
}
For adjusted p-values just run p.adjust.mat
on the p-value labels.
Sample-level
ihc_sample_cor <- compute_ihc_mutsig_cor(ihc_mat, props_sample_mat)
pheatmap(ihc_sample_cor$r, display_numbers = signif(ihc_sample_cor$p, 3))
ihc_sample_cor_summary <- compute_ihc_mutsig_cor(ihc_mat, props_sample_mat,
patient_summary = TRUE)
pheatmap(ihc_sample_cor_summary$r, display_numbers = signif(ihc_sample_cor_summary$p,
3))
Ancestral-descendant level
ihc_ad_cor <- compute_ihc_mutsig_cor(ihc_mat, props_ad_mat)
pheatmap(ihc_ad_cor$r, display_numbers = signif(ihc_ad_cor$p, 3))
ihc_ad_cor_summary <- compute_ihc_mutsig_cor(ihc_mat, props_ad_mat, patient_summary = TRUE)
pheatmap(ihc_ad_cor_summary$r, display_numbers = signif(ihc_ad_cor_summary$p,
3))
ihc_ad_cor_summary_ancestral <- compute_ihc_mutsig_cor(ihc_mat, props_ad_mat,
patient_summary = TRUE, ancestral = TRUE)
pheatmap(ihc_ad_cor_summary_ancestral$r, display_numbers = signif(ihc_ad_cor_summary_ancestral$p,
3))
Note: p-values shown are uncorrected. Patient-summarized correlations are insignificant after FDR correction.
Cluster-level analysis
Finding correlations at the level of individual signatures can be difficult. Even moreso because some published signatures are combinations of these signatures – e.g. SV-3 and SV-6 are both foldback signatures in the ancestral-descendant analysis.
Hence, we can look at the level of clusters from our heatmaps.
make_cluster_frame <- function(clusters) {
clusts <- rownames_to_column(as.data.frame(clusters), "new_id")
colnames(clusts)[2] <- "cluster"
clusts$cluster <- factor(clusts$cluster)
return(clusts)
}
NCLUST <- 2
selected_cols <- c("new_id", "condensed_id", "patient_id", "old_id", "cluster",
tiltypes)
Sample-level
clusters <- cutree(sample_heat$tree_row, NCLUST)
sample_clusts <- make_cluster_frame(clusters)
props_sample_merged_clusts <- join(props_sample_merged, sample_clusts)
sample_df <- unique(subset(props_sample_merged_clusts, select = selected_cols))
sample_df_melted <- melt(sample_df, id.vars = colnames(sample_df)[!colnames(sample_df) %in%
tiltypes], measure.vars = tiltypes, variable.name = "tiltype", value.name = "density")
pvals <- setNames(ddply(sample_df_melted, .(tiltype), function(x) {
df <- as.data.frame(x)
corres <- wilcox.test(density ~ cluster, df)
pval <- corres$p.value
eq <- substitute(italic(P) == p, list(p = format(pval, digits = 3)))
return(as.character(as.expression(eq)))
}), c("tiltype", "p.value"))
ggplot(sample_df_melted, aes(x = cluster, y = density)) + geom_boxplot() + theme_bw() +
theme_Publication() + geom_jitter(aes(colour = patient_id), position = position_jitter(width = 0.2,
height = 0)) + scale_color_manual(values = pal_patient) + facet_wrap(~tiltype,
scales = "free") + geom_text(data = pvals, aes(x = Inf, y = Inf, label = p.value),
hjust = 1.1, vjust = 1.5, size = 3, parse = TRUE)
diversity_column <- "observedDiversity_mean"
diversity_files <- list(tcr = tcr_diversity_file, bcr = bcr_diversity_file)
diversity <- lapply(names(diversity_files), function(segment) {
f <- diversity_files[[segment]]
xcr_diversity <- read_xcr_diversity_file(f, db_path)
df <- subset(xcr_diversity, select = c("condensed_id", "patient_id", diversity_column))
colnames(df) <- mapvalues(colnames(df), from = diversity_column, to = segment)
return(df)
})
names(diversity) <- names(diversity_files)
xcr_diversity <- Reduce(plyr::join, diversity)
XCR_VARS <- c("tcr", "bcr")
molsubtypes <- fread(molecular_subtype_file)
molsubtypes <- setNames(molsubtypes, c("condensed_id", "subtype"))
proportion_subclonality <- fread(proportion_subclonality_file)
proportion_subclonality_subset <- subset(proportion_subclonality, select = c("condensed_id",
"proportion_subclonal"))
exprs <- fread(nanostring_data_path)
labels <- fread(nanostring_annotations_path)
celltype_matrix <- create_celltype_matrix(exprs, labels, db_path)
pathway_matrix <- create_pathway_matrix(exprs, labels, db_path)
celltype_df <- rownames_to_column(as.data.frame(t(celltype_matrix)), var = "condensed_id")
pathway_df <- rownames_to_column(as.data.frame(t(pathway_matrix)), var = "condensed_id")
NANOSTRING_VARS <- c(rownames(celltype_matrix), rownames(pathway_matrix))
colnames(sample_clusts) <- mapvalues(colnames(sample_clusts), "new_id", "condensed_id")
ihc_stabilize_subset <- stabilize_ihc_variances(ihc_table_slide, ihc_table,
tiltypes)
data_matrices <- list(sample_clusts, ihc_stabilize_subset, xcr_diversity, celltype_df,
pathway_df, proportion_subclonality_subset)
data_matrices <- lapply(data_matrices, function(x) {
x %>% dplyr::select(-one_of("patient_id"))
})
combined_df <- Reduce(function(x, y) plyr::join(x, y, type = "full"), data_matrices)
combined_df <- subset(combined_df, !is.na(cluster) & !condensed_id %in% c("7_BrnM"))
combined_mat <- subset(combined_df, select = -c(condensed_id, cluster))
rownames(combined_mat) <- combined_df$condensed_id
# ref_dendrogram <- prune(as.dendrogram(sample_heat$tree_row), '7_BrnM')
sample_order <- sample_heat$tree_row$labels[sample_heat$tree_row$order]
sample_order <- intersect(sample_order, rownames(combined_mat))
cluster_annotations <- subset(combined_df, select = c(condensed_id, cluster))
SIGS <- c("SV-3", "SNV-5", "SV-1", "SV-2", "SNV-1")
sig_annotations <- rownames_to_column(data.frame(props_sample_mat[, SIGS]),
var = "condensed_id")
## Take logarithms to ~variance stabilize
variant_count_annotations <- variant_sample %>% group_by_(.dots = "condensed_id") %>%
summarise(snv_count = log(sum(snv_count)), bkpt_count = log(sum(bkpt_count))) %>%
subset(select = c("condensed_id", "snv_count", "bkpt_count"))
row_annotations <- Reduce(plyr::join, list(cluster_annotations, molsubtypes,
sig_annotations, variant_count_annotations))
row_annotations <- row_annotations %>% column_to_rownames(var = "condensed_id")
combined_mat_scaled <- scale(combined_mat)
# hcs <- lapply(unique(combined_df$cluster), function(clust) { ids <-
# subset(combined_df, cluster == clust)$condensed_id dists <-
# dist(combined_mat_scaled[ids,], method = 'euclidean') #dists <-
# proxy::dist(combined_mat_scaled[ids,], method = function(x,y)
# pairwise_dist(x,y,method='canberra')) hclust(dists, method = 'ward.D') })
# min_height <- max(sapply(hcs, function(x) max(unique(cophenetic(x)))))*1.1
# hc <- Reduce(function(x,y) merge(as.dendrogram(x),as.dendrogram(y),height
# = min_height), hcs) hc <- as.dendrogram(ref_dendrogram) hc <-
# as.dendrogram(sample_heat2$tree_row) ha <-
# HeatmapAnnotation(row_annotations[rownames(combined_mat_scaled),],
# which='row') Heatmap(clip_values(combined_mat_scaled, 2, -2), cluster_rows
# = hc2, cluster_columns = TRUE, split = 2, clustering_method_columns =
# 'ward.D2') + ha
pheatmap(clip_values(combined_mat_scaled, 2, -2)[sample_order, ], cluster_rows = FALSE,
cluster_cols = TRUE, annotation_row = row_annotations, fontsize = 6)
What we can see is:
- H-HRD cluster is relatively homogeneous.
- Looks like there are two clusters within the H-FBI group – one characterized by high levels of immune activity (cytotoxicity, etc.) and one characterized by low levels.
This is a pretty significant finding.
Additionally, an interesting thing is that the patients/samples with the highest proportions of foldbacks – patients 2, 3, and 9 – are actually the ones with high immune response in the foldback group! Suggestive that perhaps foldback inversions can create neoepitopes. Of course very preliminary and low sample size though.
If you’re wondering why there’s no dendrogram on the vertical axis it’s because the plotting functions I’m currently using don’t allow for self-specified dendrograms. Trying to make one that lets me do so but it’s taking a bit of acrobatics and I’ve wasted a lot of time already …
To see ICGC validation, go to that section. I’ll add a link later …
TCR/BCR diversity
combined_df$patient_id <- map_id(combined_df$condensed_id, from = "condensed_id",
to = "patient_id", db_path)
combined_df_xcr <- melt(combined_df, id.vars = c("condensed_id", "patient_id",
"cluster"), measure.vars = XCR_VARS, variable.name = "type", value.name = "value")
pvals <- setNames(ddply(combined_df_xcr, .(type), function(x) {
df <- as.data.frame(x)
corres <- wilcox.test(value ~ cluster, df)
pval <- corres$p.value
eq <- substitute(italic(P) == p, list(p = format(pval, digits = 3)))
return(as.character(as.expression(eq)))
}), c("type", "p.value"))
ggplot(combined_df_xcr, aes(x = cluster, y = value)) + geom_boxplot() + theme_bw() +
theme_Publication() + geom_jitter(aes(colour = patient_id), position = position_jitter(width = 0.2,
height = 0)) + scale_color_manual(values = pal_patient) + facet_wrap(~type,
scales = "free") + geom_text(data = pvals, aes(x = Inf, y = Inf, label = p.value),
hjust = 1.1, vjust = 1.5, size = 3, parse = TRUE)
Celltypes and pathways
combined_df$patient_id <- map_id(combined_df$condensed_id, from = "condensed_id",
to = "patient_id", db_path)
combined_df_nanostring <- melt(combined_df, id.vars = c("condensed_id", "patient_id",
"cluster"), measure.vars = NANOSTRING_VARS, variable.name = "type", value.name = "value")
pvals <- setNames(ddply(combined_df_nanostring, .(type), function(x) {
df <- as.data.frame(x)
corres <- wilcox.test(value ~ cluster, df)
pval <- corres$p.value
eq <- substitute(italic(P) == p, list(p = format(pval, digits = 3)))
return(as.character(as.expression(eq)))
}), c("type", "p.value"))
ggplot(combined_df_nanostring, aes(x = cluster, y = value)) + geom_boxplot() +
theme_bw() + theme_Publication() + geom_jitter(aes(colour = patient_id),
position = position_jitter(width = 0.2, height = 0)) + scale_color_manual(values = pal_patient) +
facet_wrap(~type, scales = "free") + geom_text(data = pvals, aes(x = Inf,
y = Inf, label = p.value), hjust = 1.1, vjust = 1.5, size = 3, parse = TRUE)
Ancestral-descendant level
clusters <- cutree(patient_heat$tree_row, NCLUST)
patient_clusts <- make_cluster_frame(clusters)
props_ad_merged_clusts <- join(props_ad_merged, patient_clusts)
patient_df <- unique(subset(props_ad_merged_clusts, select = selected_cols))
patient_df_melted <- melt(patient_df, id.vars = colnames(patient_df)[!colnames(patient_df) %in%
tiltypes], measure.vars = tiltypes, variable.name = "tiltype", value.name = "density")
pvals <- setNames(ddply(patient_df_melted, .(tiltype), function(x) {
df <- as.data.frame(x)
corres <- wilcox.test(density ~ cluster, df)
pval <- corres$p.value
eq <- substitute(italic(P) == p, list(p = format(pval, digits = 3)))
return(as.character(as.expression(eq)))
}), c("tiltype", "p.value"))
ggplot(patient_df_melted, aes(x = cluster, y = density)) + geom_boxplot() +
theme_bw() + theme_Publication() + geom_jitter(aes(colour = patient_id),
position = position_jitter(width = 0.2, height = 0)) + scale_color_manual(values = pal_patient) +
facet_wrap(~tiltype, scales = "free") + geom_text(data = pvals, aes(x = Inf,
y = Inf, label = p.value), hjust = 1.1, vjust = 1.5, size = 3, parse = TRUE)
Hence, the foldback group (cluster 2) has lower CD8+ and CD4+ densities than the HRD group, as expected.
A caveat is that p-values are computed with Wilcoxon, irrespective of patient number. We can’t really opt for a nonparametric nested ranks test because very few patients (4) are actually present in both clusters.
Ancestral analysis
Ancestral variants may have different properties from descendant variants.
Sample-specific
Here we’ll just naively allow for subclonal variants to be counted multiple times.
props_ad_merged$is_ancestral <- as.factor(props_ad_merged$is_ancestral)
pvals <- setNames(ddply(props_ad_merged, .(signature), function(x) {
df <- as.data.frame(x)
corres <- wilcox.test(proportion ~ is_ancestral, df)
pval <- corres$p.value
eq <- substitute(italic(P) == p, list(p = format(pval, digits = 3)))
return(as.character(as.expression(eq)))
}), c("signature", "p.value"))
ggplot(props_ad_merged, aes(x = is_ancestral, y = proportion)) + geom_boxplot() +
geom_jitter(aes(colour = patient_id), position = position_jitter(width = 0.2,
height = 0)) + theme_bw() + theme_Publication() + scale_colour_manual(values = pal_patient) +
facet_wrap(~signature, scales = "free") + geom_text(data = pvals, aes(x = Inf,
y = Inf, label = p.value), hjust = 1.1, vjust = 1.5, size = 3, parse = TRUE)
P-values are uncorrected.
Unsurprisingly, ancestral samples have significantly more of SNV-4, the age signature. Insignificant, but they also have less SV-1, which is one of the BRCA’s (small deletions).
Union
Here we’ll actually take the union of subclonal variants for each patient so we don’t overcount.
props_patient_ad_merged$is_ancestral <- as.factor(props_patient_ad_merged$is_ancestral)
pvals <- setNames(ddply(props_patient_ad_merged, .(signature), function(x) {
df <- as.data.frame(x)
corres <- wilcox.test(proportion ~ is_ancestral, df, paired = TRUE)
pval <- corres$p.value
eq <- substitute(italic(P) == p, list(p = format(pval, digits = 3)))
return(as.character(as.expression(eq)))
}), c("signature", "p.value"))
ggplot(props_patient_ad_merged, aes(x = is_ancestral, y = proportion)) + geom_boxplot() +
geom_jitter(aes(colour = patient_id), position = position_jitter(width = 0.2,
height = 0)) + theme_bw() + theme_Publication() + scale_colour_manual(values = pal_patient) +
facet_wrap(~signature, scales = "free") + geom_text(data = pvals, aes(x = Inf,
y = Inf, label = p.value), hjust = 1.1, vjust = 1.5, size = 3, parse = TRUE)
Aside from age and HRD which were described in McPherson et al., there’s additionally the SV-3 signature – a translocation signature. Implying that translocations are early (ancestral) events in HGSC – perhaps this is interesting? I have to do a literature search.
anc_desc_patient <- dcast(props_patient_ad_merged, formula = patient_id + signature ~
is_ancestral, value.var = "proportion")
pvals <- setNames(ddply(anc_desc_patient, .(signature), function(x) {
df <- as.data.frame(x)
corres <- with(df, cor.test(`0`, `1`, method = "spearman"))
pval <- corres$p.value
eq <- substitute(italic(P) == p, list(p = format(pval, digits = 3)))
return(as.character(as.expression(eq)))
}), c("signature", "p.value"))
ggplot(anc_desc_patient, aes(x = `1`, y = `0`)) + geom_point() + geom_point(aes(colour = patient_id)) +
theme_bw() + theme_Publication() + scale_colour_manual(values = pal_patient) +
facet_wrap(~signature, scales = "free") + geom_text(data = pvals, aes(x = Inf,
y = Inf, label = p.value), hjust = 1.1, vjust = 1.5, size = 3, parse = TRUE)
ICGC validation
FBI subclusters
specimen_data <- fread(icgc_specimen_file)
icgc_subtypes <- fread(icgc_subtype_file)
icgc_expr_mat <- fread(icgc_expr_mat_file)
icgc_expr_df <- icgc_expr_mat %>% as.data.frame %>% column_to_rownames("icgc_donor_id") %>%
t %>% as.data.frame %>% rownames_to_column("Name")
icgc_celltype_matrix <- create_pathway_matrix(icgc_expr_df, labels, db_path,
convert_ids = FALSE)
icgc_immune <- rownames_to_column(as.data.frame(t(icgc_celltype_matrix)), var = "icgc_donor_id")
# cytotoxic_markers <- c('PRF1', 'GZMA', 'HLA-A', 'HLA-B', 'HLA-C', 'GZMK',
# 'GZMM', 'GZMH', 'GNLY', 'GZMB') icgc_immune <-
# as.data.frame(subset(icgc_expr_mat, select=c('icgc_donor_id',
# cytotoxic_markers))) for (i in 2:ncol(icgc_immune)) { icgc_immune[,i] <-
# log10(icgc_immune[,i]) }
props_ov_combined <- read_mutsig_output(mmctm_ov_combined_result_dir, samples,
external = TRUE)
props_ov_combined_prop <- props_ov_combined$prop
props_ov_combined_mat <- create_mutsig_matrix(props_ov_combined_prop, col = "proportion")
# props_ov_combined_mat_scaled <- as.data.frame(apply(props_ov_combined_mat,
# 2, function(x) { #x <- logit(x) (x-median(x,na.rm=TRUE))/mad(x,na.rm=TRUE)
# }))
props_ov_combined_mat_scaled <- scale(props_ov_combined_mat)
# props_ov_combined_mat_scaled <- clip_values(props_ov_combined_mat_scaled,
# 3, -3) clip_values(scale(props_ov_combined_mat), 2, -2)
ov_combined_heat <- pheatmap(props_ov_combined_mat_scaled, fontsize_row = 6,
clustering_distance_rows = "correlation", clustering_method = "ward.D2")
NCLUST <- 3
clusters <- cutree(ov_combined_heat$tree_row, NCLUST)
ov_combined_clusts <- make_cluster_frame(clusters)
colnames(icgc_immune) <- mapvalues(colnames(icgc_immune), "icgc_donor_id", "new_id")
data_matrices <- list(ov_combined_clusts, icgc_immune)
data_matrices <- lapply(data_matrices, function(x) {
x %>% dplyr::select(-one_of("patient_id"))
})
combined_df <- Reduce(function(x, y) plyr::join(x, y, type = "inner"), data_matrices)
# combined_df <- subset(combined_df, cluster == 1)
combined_mat <- subset(combined_df, select = -c(new_id, cluster))
rownames(combined_mat) <- combined_df$new_id
sample_order <- ov_combined_heat$tree_row$labels[ov_combined_heat$tree_row$order]
sample_order <- intersect(sample_order, rownames(combined_mat))
cluster_annotations <- subset(combined_df, select = c(new_id, cluster))
SIGS <- c("SV-8", "SNV-1", "SV-4", "SV-7", "SV-5", "SNV-7", "SV-1")
sig_annotations <- rownames_to_column(data.frame(props_ov_combined_mat[, SIGS],
check.names = FALSE), var = "new_id")
subtype_annotations <- subset(icgc_subtypes, select = c("icgc_donor_id", "subtype",
"nmf_subtype"))
colnames(subtype_annotations)[1] <- "new_id"
row_annotations <- Reduce(plyr::join, list(cluster_annotations, sig_annotations,
subtype_annotations))
row_annotations <- row_annotations %>% column_to_rownames(var = "new_id")
select_rows <- rownames(subset(row_annotations, `SV-4` < 1))
notx <- subset(specimen_data, str_detect(specimen_type, "Primary") & specimen_donor_treatment_type ==
"no treatment")$icgc_donor_id
sample_order <- intersect(sample_order, select_rows)
sample_order <- intersect(sample_order, notx)
# order_fbi <- order(row_annotations[rownames(combined_mat_scaled),]$`SV-8`)
# combined_mat_scaled <- scale(combined_mat)
combined_mat_scaled <- as.data.frame(apply(combined_mat, 2, function(x) (x -
median(x, na.rm = TRUE))/mad(x, na.rm = TRUE)))
combined_mat_scaled <- combined_mat_scaled[sample_order, ]
other_mat <- props_ov_combined_mat_scaled[sample_order, ]
pheatmap(clip_values(cbind(combined_mat_scaled, other_mat), 2, -2), cluster_rows = FALSE,
cluster_cols = TRUE, annotation_row = row_annotations, fontsize = 6, clustering_distance_cols = "correlation",
clustering_method = "ward.D2")
# rowmean <- rowMeans(combined_mat_scaled) a <- merge(row_annotations,
# as.data.frame(rowmean), by='row.names') ggplot(subset(a, cluster != 1),
# aes(x=rowmean > median(rowmean), y = `SNV-7`)) + geom_boxplot() +
# theme_bw()
clusters <- setNames(cluster_annotations, c("icgc_donor_id", "cluster"))
Differential expression
icgc_expr_melted <- fread(icgc_expr_raw_file)
Read 0.0% of 3709596 rows
Read 11.6% of 3709596 rows
Read 23.5% of 3709596 rows
Read 35.3% of 3709596 rows
Read 47.4% of 3709596 rows
Read 59.3% of 3709596 rows
Read 71.2% of 3709596 rows
Read 83.0% of 3709596 rows
Read 95.2% of 3709596 rows
Read 3709596 rows and 15 (of 15) columns from 0.484 GB file in 00:00:16
icgc_expr_casted_matrix <- expression_df_to_matrix(icgc_expr_melted, summarize_over = "icgc_donor_id",
measure_var = "raw_read_count")
icgc_counts <- icgc_expr_casted_matrix %>% column_to_rownames(var = "icgc_donor_id")
icgc_counts_filtered <- icgc_counts[clusters$icgc_donor_id, ]
icgc_counts_filtered <- icgc_counts_filtered %>% t %>% as.data.frame
dge <- DGEList(counts = icgc_counts_filtered)
dge <- calcNormFactors(dge)
library_sizes <- colSums(icgc_counts_filtered)
library_sizes
DO46325 DO46327 DO46328 DO46329 DO46330 DO46331 DO46332
78792862 106156028 146714663 100782140 66648443 94187980 128035792
DO46333 DO46334 DO46336 DO46338 DO46340 DO46342 DO46344
70763442 87776088 106659467 67264480 72586319 91076278 97575765
DO46346 DO46350 DO46352 DO46354 DO46356 DO46358 DO46360
78219082 65895620 95169618 92438598 46265120 90175214 96687938
DO46362 DO46364 DO46366 DO46368 DO46370 DO46372 DO46374
95856331 101399457 90792124 97051216 100193800 92056106 116039147
DO46376 DO46378 DO46380 DO46382 DO46384 DO46386 DO46388
81664713 159721278 90029262 80192737 80784914 88436986 95577226
DO46390 DO46392 DO46394 DO46396 DO46398 DO46400 DO46402
88612126 91723742 91209587 87119354 79391516 93620832 94446982
DO46404 DO46408 DO46412 DO46448 DO46493 DO46551 DO46560
97746414 91333342 95814730 88596500 123302138 84121097 109112881
DO46561 DO46566 DO46568 DO46571 DO46576 DO46581 DO46586
99873236 79993818 95909367 75957744 91988944 98993304 95877151
DO46588 DO46591 DO46597 DO46602 DO46606 DO46611
78452166 113011642 94472903 96109428 98892638 61422278
max(library_sizes)/min(library_sizes)
[1] 3.452304
design <- model.matrix(~0 + factor(clusters$cluster))
colnames(design) <- paste0("clust", 1:NCLUST)
contrast.matrix <- makeContrasts(clust1 - clust3, clust2 - (clust1 + clust3)/2,
clust2 - clust1, levels = design)
limma-trend
We straddle close to threshold for doing this but let’s do it anyways.
fit_trend <- rnaseq_DE_initial_fit(dge, design, type = "limma_trend")
fit_trend_contrasts <- contrasts.fit(fit_trend, contrast.matrix)
fit_trend_contrasts <- eBayes(fit_trend_contrasts)
hrd_fbi_genes <- topTable(fit_trend_contrasts, coef = "clust2 - (clust1 + clust3)/2",
adjust.method = "BH", number = Inf, sort = "p")
fbi_immune_genes <- topTable(fit_trend_contrasts, coef = "clust1 - clust3",
number = Inf, sort = "p")
hrd_fbi_highimmune_genes <- topTable(fit_trend_contrasts, coef = "clust2 - clust1",
number = Inf, sort = "p")
hrd_fbi_pathways <- pathway_analysis(hrd_fbi_genes, gs_type = "go")
fbi_immune_pathways <- pathway_analysis(fbi_immune_genes, gs_type = "go")
hrd_fbi_highimmune_pathways <- pathway_analysis(hrd_fbi_highimmune_genes, gs_type = "go")
HRD vs. FBI samples (clust2 vs. clust1+clust3)
datatable(hrd_fbi_pathways, extensions = "Buttons", options = list(pageLength = 5,
scrollX = TRUE, dom = "Bfrtip", buttons = c("copy", "csv", "excel", "pdf",
"print")))
High immune FBI vs. low immune FBI (clust1 vs. clust3)
datatable(fbi_immune_pathways, extensions = "Buttons", options = list(pageLength = 5,
scrollX = TRUE, dom = "Bfrtip", buttons = c("copy", "csv", "excel", "pdf",
"print")))
HRD vs. high immune FBI (clust2 vs. clust1)
datatable(hrd_fbi_highimmune_pathways, extensions = "Buttons", options = list(pageLength = 5,
scrollX = TRUE, dom = "Bfrtip", buttons = c("copy", "csv", "excel", "pdf",
"print")))
voom
We have greater than 3-fold variability in library size, so we’re better off using the voom method for DE analysis.
fit_voom <- rnaseq_DE_initial_fit(dge, design, type = "voom")
fit_voom_contrasts <- contrasts.fit(fit_voom, contrast.matrix)
fit_voom_contrasts <- eBayes(fit_voom_contrasts)
hrd_fbi_genes <- topTable(fit_voom_contrasts, coef = "clust2 - (clust1 + clust3)/2",
adjust.method = "BH", number = Inf, sort = "p")
fbi_immune_genes <- topTable(fit_voom_contrasts, coef = "clust1 - clust3", number = Inf,
sort = "p")
hrd_fbi_highimmune_genes <- topTable(fit_voom_contrasts, coef = "clust2 - clust1",
number = Inf, sort = "p")
hrd_fbi_pathways <- pathway_analysis(hrd_fbi_genes, gs_type = "kegg")
fbi_immune_pathways <- pathway_analysis(fbi_immune_genes, gs_type = "kegg")
hrd_fbi_highimmune_pathways <- pathway_analysis(hrd_fbi_highimmune_genes, gs_type = "kegg")
# pv.out.list <- sapply(path.ids2, function(pid) pathview( gene.data =
# exp.fc, pathway.id = pid, species = 'hsa', out.suffix=out.suffix))
Note: BRCA1, BRCA2, and RPA are also downregulated in the HRD group – the entire HR-associated gene cluster isn’t significantly downregulated though.
One of the key reasons why the entire HR pathway isn’t significantly downregulated is because PolQ is part of it and PolQ is upregulated. Of note, PolQ promotes MMEJ, providing further evidence for MMEJ activity in FBI tumours.
HRD vs. FBI samples (clust2 vs. clust1+clust3)
datatable(hrd_fbi_pathways, extensions = "Buttons", options = list(pageLength = 5,
scrollX = TRUE, dom = "Bfrtip", buttons = c("copy", "csv", "excel", "pdf",
"print")))
High immune FBI vs. low immune FBI (clust1 vs. clust3)
datatable(fbi_immune_pathways, extensions = "Buttons", options = list(pageLength = 5,
scrollX = TRUE, dom = "Bfrtip", buttons = c("copy", "csv", "excel", "pdf",
"print")))
HRD vs. high immune FBI (clust2 vs. clust1)
datatable(hrd_fbi_highimmune_pathways, extensions = "Buttons", options = list(pageLength = 5,
scrollX = TRUE, dom = "Bfrtip", buttons = c("copy", "csv", "excel", "pdf",
"print")))
Some DNA repair genes
v <- voom(icgc_counts_filtered, design, normalize = "quantile", plot = FALSE)
icgc_expr_df_wide <- v$E %>% t %>% as.data.frame %>% rownames_to_column("icgc_donor_id")
icgc_expr_df_melted <- reshape2::melt(icgc_expr_df_wide, id.vars = c("icgc_donor_id"),
measure.vars = colnames(icgc_expr_df_wide)[2:ncol(icgc_expr_df_wide)], variable.name = "Name",
value.name = "expression")
icgc_expr_df_melted_labeled <- plyr::join(icgc_expr_df_melted, clusters)
REPAIR_GENES <- c("BRCA1", "BRCA2", "POLQ", "FEN1", "XRCC1", "PARP1", "LIG3")
icgc_expr_df_melted_labeled_filtered <- subset(icgc_expr_df_melted_labeled,
Name %in% REPAIR_GENES)
ggplot(subset(icgc_expr_df_melted_labeled_filtered, !is.na(cluster)), aes(x = cluster,
y = expression)) + geom_boxplot() + geom_jitter(position = position_jitter(width = 0.2,
height = 0)) + theme_bw() + theme_Publication() + facet_wrap(~Name, scales = "free")
Survival
icgc_clinical <- read_donor_specimen_survival(icgc_donor_file, icgc_specimen_file)
icgc_clinical_labeled <- plyr::join(clusters, icgc_clinical)
icgc_clinical_labeled$SurvObj <- with(icgc_clinical_labeled, Surv(donor_survival_time,
donor_vital_status == "deceased"))
Survival by FBI status …
simple_survival_analysis(SurvObj ~ cluster != 2, data = icgc_clinical_labeled)
Call:
survival::survdiff(formula = formula, data = data)
N Observed Expected (O-E)^2/E (O-E)^2/V
cluster != 2=FALSE 27 25 29.3 0.641 1.34
cluster != 2=TRUE 38 34 29.7 0.634 1.34
Chisq= 1.3 on 1 degrees of freedom, p= 0.246
simple_survival_analysis(SurvObj ~ cluster, data = icgc_clinical_labeled)
Call:
survival::survdiff(formula = formula, data = data)
N Observed Expected (O-E)^2/E (O-E)^2/V
cluster=1 21 18 15.8 0.293 0.413
cluster=2 27 25 29.3 0.641 1.344
cluster=3 17 16 13.8 0.344 0.473
Chisq= 1.3 on 2 degrees of freedom, p= 0.509
TCGA validation
While we can’t directly do mutation signature analysis with MMCTM on exome data (I suppose this is theoretically possible but we’d probably be restricted in the types of events we can call, like long SVs), we can look at the FBI-HLAMP finding from Yikan’s paper and see if that lines up with immune signatures.
tcga_expr_mat <- read_tcga_exprs(tcga_expr_mat_file)
Read 0.0% of 570 rows
Read 570 rows and 12497 (of 12497) columns from 0.121 GB file in 00:00:05
tcga_ov_annotation <- fread(tcga_ov_annotation_file)
# tcga_fbi_hlamp_proportions <- fread(tcga_fbi_hlamp_proportions_file)
tcga_expr_df <- tcga_expr_mat %>% as.data.frame %>% column_to_rownames("tcga_sample_id") %>%
t %>% as.data.frame %>% rownames_to_column("Name")
tcga_celltype_matrix <- create_pathway_matrix(tcga_expr_df, labels, db_path,
convert_ids = FALSE)
tcga_immune <- rownames_to_column(as.data.frame(t(tcga_celltype_matrix)), var = "tcga_sample_id")
mat <- tcga_immune %>% column_to_rownames("tcga_sample_id") %>% scale #%>% clip_values(2, -2)
row_annotations <- tcga_ov_annotation %>% as.data.frame %>% column_to_rownames("tcga_sample_id")
tcgaheat <- pheatmap(mat, annotation_row = row_annotations, cluster_rows = TRUE,
cluster_cols = TRUE, clustering_method = "ward.D", show_rownames = FALSE)
# tcga_clusters <-
# rownames_to_column(data.frame(cluster=factor(cutree(tcgaheat$tree_row,
# 2))), var = 'tcga_sample_id')
col <- "Cytotoxicity"
tcga_clusters <- rownames_to_column(data.frame(cluster = mat[, col] > median(mat[,
col])), "tcga_sample_id") %>% mutate(cluster = as.factor(as.numeric(cluster)))
# row_annotations <- plyr::join(tcga_ov_annotation, tcga_clusters) %>%
# as.data.frame %>% column_to_rownames('tcga_sample_id') pheatmap(mat,
# annotation_row = row_annotations, cluster_rows = TRUE, cluster_cols =
# TRUE, clustering_method = 'ward.D', show_rownames = FALSE)
df_merged <- plyr::join(tcga_immune, tcga_ov_annotation)
expr_measure_vars <- colnames(tcga_immune)[2:ncol(tcga_immune)]
df_merged_melted <- melt(df_merged, id.vars = c("tcga_sample_id", "Subgroup",
"MolecularSubtype", "BRCA.status"), measure.vars = expr_measure_vars, variable.name = "Measure",
value.name = "Expression")
pvals <- setNames(ddply(subset(df_merged_melted, !is.na(Subgroup)), .(Measure),
function(x) {
df <- as.data.frame(x)
testres <- kruskal.test(Expression ~ factor(Subgroup), data = df)
pval <- testres$p.value
eq <- substitute(italic(P) == p, list(p = format(pval, digits = 3)))
return(as.character(as.expression(eq)))
}), c("Measure", "p.value"))
ggplot(subset(df_merged_melted, !is.na(Subgroup)), aes(x = Subgroup, y = Expression)) +
geom_boxplot() + theme_bw() + facet_wrap(~Measure, scales = "free", ncol = 3) +
theme_Publication() + geom_text(data = pvals, aes(x = Inf, y = Inf, label = p.value),
hjust = 1.1, vjust = 1.5, size = 3, parse = TRUE)
P-values are uncorrected. So after correction, we aren’t going to get anything significant.
In conclusion, the immune subgroups constitute a new subgrouping independent of FBI-HLAMP.
I’ve also done correlation testing using the logR values that Yikan’s provided me (rather than the discrete groupings of no AMP, FBI-AMP low, and FBI-AMP high), but the correlations are very poor (rho values of ~ -0.05 or so, p-values of >=0.3).
Survival
tcga_clinical <- read_tcga_clinical(tcga_donor_file, type = "synapse2", filter = TRUE,
unique = FALSE)
tcga_clinical_labeled <- Reduce(function(x, y) plyr::join(x, y, type = "full"),
list(tcga_clusters, tcga_clinical, tcga_ov_annotation))
tcga_clinical_labeled$SurvObj <- with(tcga_clinical_labeled, Surv(ifelse(vital_status ==
"Dead", death_days_to, last_contact_days_to), vital_status == "Dead"))
# tcga_clinical_labeled <- subset(tcga_clinical_labeled,
# additional_drug_therapy == 'NO' & additional_immuno_therapy == 'NO' &
# additional_pharmaceutical_therapy == 'NO' & targeted_molecular_therapy ==
# 'NO' & radiation_therapy == 'NO') tcga_clinical_labeled <-
# subset(tcga_clinical_labeled, str_detect(tumor_stage, '^III'))
FBI-AMP colocalization
Let’s first stratify by Yikan’s groupings:
simple_survival_analysis(SurvObj ~ Subgroup, data = tcga_clinical_labeled)
Call:
survival::survdiff(formula = formula, data = data)
n=430, 147 observations deleted due to missingness.
N Observed Expected (O-E)^2/E (O-E)^2/V
Subgroup=FBI-AMP High 173 86 67.7 4.94 7.43
Subgroup=FBI-AMP Low 182 80 92.1 1.60 2.87
Subgroup=No AMP 75 43 49.2 0.77 1.03
Chisq= 7.4 on 2 degrees of freedom, p= 0.0243
Note: The logrank p-values in the paper are incorrect, the one’s I’ve computed are the right ones. In the paper, the death days from the dead patients were not correctly combined into the computation, so p-values were computed without the dead patients.
Immune activity
We’ll now stratify by cluster
, a variable that indicates whether or not we’re above median in terms of Cytotoxicity. In other words, 1 means we’re high immune, and 0 means we’re low immune.
simple_survival_analysis(SurvObj ~ cluster, data = tcga_clinical_labeled)
Call:
survival::survdiff(formula = formula, data = data)
n=562, 15 observations deleted due to missingness.
N Observed Expected (O-E)^2/E (O-E)^2/V
cluster=0 280 162 148 1.38 2.79
cluster=1 282 132 146 1.39 2.79
Chisq= 2.8 on 1 degrees of freedom, p= 0.095
Low immune tumours trend towards poor survival but this is not significant at all.
Now let’s try by treating immune activity as a continuous variable, controlling for other covariates:
tcga_immune_continuous <- tcga_celltype_matrix %>% t %>% as.data.frame %>% rownames_to_column("tcga_sample_id")
tcga_clinical_labeled_continuous <- plyr::join(tcga_clinical_labeled, tcga_immune_continuous)
coxph(SurvObj ~ Cytotoxicity + age_at_initial_pathologic_diagnosis + tumor_grade +
str_extract(tumor_stage, "[IV]+") + additional_chemo_therapy + additional_drug_therapy +
additional_immuno_therapy, tcga_clinical_labeled_continuous)
Call:
coxph(formula = SurvObj ~ Cytotoxicity + age_at_initial_pathologic_diagnosis +
tumor_grade + str_extract(tumor_stage, "[IV]+") + additional_chemo_therapy +
additional_drug_therapy + additional_immuno_therapy, data = tcga_clinical_labeled_continuous)
coef exp(coef) se(coef)
Cytotoxicity -1.68e-01 8.45e-01 8.25e-02
age_at_initial_pathologic_diagnosis 1.95e-02 1.02e+00 5.50e-03
tumor_gradeG1 -4.72e-01 6.24e-01 1.27e+00
tumor_gradeG2 -2.99e-01 7.42e-01 1.06e+00
tumor_gradeG3 -1.19e-02 9.88e-01 1.06e+00
tumor_gradeG4 3.76e-01 1.46e+00 1.45e+00
tumor_gradeGB 1.14e-02 1.01e+00 1.45e+00
tumor_gradeGX 3.20e-01 1.38e+00 1.16e+00
str_extract(tumor_stage, "[IV]+")II 1.12e-01 1.12e+00 7.00e-01
str_extract(tumor_stage, "[IV]+")III 9.32e-01 2.54e+00 5.91e-01
str_extract(tumor_stage, "[IV]+")IV 1.19e+00 3.29e+00 6.07e-01
additional_chemo_therapy[Not Available] -9.30e-03 9.91e-01 3.53e+03
additional_chemo_therapyNO -1.14e+00 3.19e-01 3.53e+03
additional_chemo_therapyYES -9.73e-01 3.78e-01 3.53e+03
additional_drug_therapy[Not Available] -1.49e+01 3.44e-07 2.42e+03
additional_drug_therapy[Pending] 5.16e-01 1.68e+00 3.53e+03
additional_drug_therapyNO 4.78e-01 1.61e+00 3.53e+03
additional_drug_therapyYES -1.28e-02 9.87e-01 3.53e+03
additional_immuno_therapy[Not Available] 1.44e+01 1.87e+06 2.58e+03
additional_immuno_therapy[Pending] -1.40e+01 8.52e-07 4.19e+03
additional_immuno_therapyNO -1.95e-01 8.23e-01 3.30e-01
additional_immuno_therapyYES NA NA 0.00e+00
z p
Cytotoxicity -2.04 0.04158
age_at_initial_pathologic_diagnosis 3.55 0.00038
tumor_gradeG1 -0.37 0.71118
tumor_gradeG2 -0.28 0.77908
tumor_gradeG3 -0.01 0.99097
tumor_gradeG4 0.26 0.79603
tumor_gradeGB 0.01 0.99372
tumor_gradeGX 0.28 0.78225
str_extract(tumor_stage, "[IV]+")II 0.16 0.87332
str_extract(tumor_stage, "[IV]+")III 1.58 0.11449
str_extract(tumor_stage, "[IV]+")IV 1.96 0.04961
additional_chemo_therapy[Not Available] 0.00 1.00000
additional_chemo_therapyNO 0.00 0.99974
additional_chemo_therapyYES 0.00 0.99978
additional_drug_therapy[Not Available] -0.01 0.99509
additional_drug_therapy[Pending] 0.00 0.99988
additional_drug_therapyNO 0.00 0.99989
additional_drug_therapyYES 0.00 1.00000
additional_immuno_therapy[Not Available] 0.01 0.99553
additional_immuno_therapy[Pending] 0.00 0.99734
additional_immuno_therapyNO -0.59 0.55541
additional_immuno_therapyYES NA NA
Likelihood ratio test=46.8 on 21 df, p=0.00101
n= 559, number of events= 292
(18 observations deleted due to missingness)
Indeed, immune activity is significantly associated with prolonged survival (negative coefficient = fewer death events).
Stratification by foldback-AMP status
fbi_high <- subset(tcga_clinical_labeled, Subgroup == "FBI-AMP High")
fbi_low <- subset(tcga_clinical_labeled, Subgroup == "FBI-AMP Low")
fbi_no <- subset(tcga_clinical_labeled, Subgroup == "No AMP")
fbi_nothigh <- subset(tcga_clinical_labeled, Subgroup %in% c("FBI-AMP Low",
"No AMP"))
simple_survival_analysis(SurvObj ~ cluster, data = fbi_high)
Call:
survival::survdiff(formula = formula, data = data)
n=172, 2 observations deleted due to missingness.
N Observed Expected (O-E)^2/E (O-E)^2/V
cluster=0 90 45 48.9 0.310 0.739
cluster=1 82 40 36.1 0.419 0.739
Chisq= 0.7 on 1 degrees of freedom, p= 0.39
simple_survival_analysis(SurvObj ~ cluster, data = fbi_low)
Call:
survival::survdiff(formula = formula, data = data)
n=181, 5 observations deleted due to missingness.
N Observed Expected (O-E)^2/E (O-E)^2/V
cluster=0 91 42 43.5 0.0514 0.116
cluster=1 90 37 35.5 0.0630 0.116
Chisq= 0.1 on 1 degrees of freedom, p= 0.733
simple_survival_analysis(SurvObj ~ cluster, data = fbi_no)
Call:
survival::survdiff(formula = formula, data = data)
N Observed Expected (O-E)^2/E (O-E)^2/V
cluster=0 34 24 18.2 1.84 3.23
cluster=1 41 19 24.8 1.35 3.23
Chisq= 3.2 on 1 degrees of freedom, p= 0.0724
What’s curious about this is that it seems only the no AMP group gets any benefit from having a high immune response. So there’s no benefit within the foldback group of high/low immune response.
simple_survival_analysis(SurvObj ~ cluster, data = fbi_nothigh)
Call:
survival::survdiff(formula = formula, data = data)
n=256, 5 observations deleted due to missingness.
N Observed Expected (O-E)^2/E (O-E)^2/V
cluster=0 125 66 61.9 0.275 0.559
cluster=1 131 56 60.1 0.283 0.559
Chisq= 0.6 on 1 degrees of freedom, p= 0.455
Stratification by immune activity
immune_low <- subset(tcga_clinical_labeled, cluster == 0)
immune_high <- subset(tcga_clinical_labeled, cluster == 1)
simple_survival_analysis(SurvObj ~ Subgroup, data = immune_high)
Call:
survival::survdiff(formula = formula, data = data)
n=213, 72 observations deleted due to missingness.
N Observed Expected (O-E)^2/E (O-E)^2/V
Subgroup=FBI-AMP High 82 40 28.1 5.005 7.303
Subgroup=FBI-AMP Low 90 37 40.5 0.299 0.531
Subgroup=No AMP 41 19 27.4 2.569 3.935
Chisq= 8.3 on 2 degrees of freedom, p= 0.0155
simple_survival_analysis(SurvObj ~ Subgroup, data = immune_low)
Call:
survival::survdiff(formula = formula, data = data)
n=215, 70 observations deleted due to missingness.
N Observed Expected (O-E)^2/E (O-E)^2/V
Subgroup=FBI-AMP High 90 45 38.6 1.060 1.651
Subgroup=FBI-AMP Low 91 42 50.8 1.535 2.843
Subgroup=No AMP 34 24 21.6 0.275 0.345
Chisq= 2.9 on 2 degrees of freedom, p= 0.236
Likewise, the only benefit of being non-foldback is derived from the immune-high group – there’s no difference between being foldback or not if you’re in the low immune group.
Multivariate models
simple_survival_analysis(SurvObj ~ Subgroup + cluster, data = tcga_clinical_labeled)
Call:
survival::survdiff(formula = formula, data = data)
n=428, 149 observations deleted due to missingness.
N Observed Expected (O-E)^2/E (O-E)^2/V
Subgroup=FBI-AMP High, cluster=0 90 45 38.4 1.118 1.380
Subgroup=FBI-AMP High, cluster=1 82 40 28.5 4.624 5.436
Subgroup=FBI-AMP Low, cluster=0 91 42 50.1 1.307 1.730
Subgroup=FBI-AMP Low, cluster=1 90 37 41.1 0.415 0.523
Subgroup=No AMP, cluster=0 34 24 21.0 0.444 0.496
Subgroup=No AMP, cluster=1 41 19 27.9 2.819 3.331
Chisq= 10.9 on 5 degrees of freedom, p= 0.0525
We’ll next combined FBI-AMP low and No AMP into a single category called FBI-NotHigh.
tcga_clinical_labeled$newgroup <- ifelse(tcga_clinical_labeled$Subgroup == "FBI-AMP High",
"FBI-High", "FBI-NotHigh")
simple_survival_analysis(SurvObj ~ newgroup + cluster, data = tcga_clinical_labeled)
Call:
survival::survdiff(formula = formula, data = data)
n=428, 149 observations deleted due to missingness.
N Observed Expected (O-E)^2/E (O-E)^2/V
newgroup=FBI-High, cluster=0 90 45 38.4 1.118 1.380
newgroup=FBI-High, cluster=1 82 40 28.5 4.624 5.436
newgroup=FBI-NotHigh, cluster=0 125 66 71.0 0.358 0.549
newgroup=FBI-NotHigh, cluster=1 131 56 69.0 2.448 3.686
Chisq= 8.7 on 3 degrees of freedom, p= 0.0337
Let’s also do Cox proportional hazards models:
mod1 <- coxph(SurvObj ~ Cytotoxicity + Subgroup + age_at_initial_pathologic_diagnosis +
tumor_grade + str_extract(tumor_stage, "[IV]+") + additional_chemo_therapy +
additional_drug_therapy + additional_immuno_therapy, tcga_clinical_labeled_continuous)
anova(mod1)
mod2 <- coxph(SurvObj ~ Cytotoxicity + Subgroup + age_at_initial_pathologic_diagnosis,
tcga_clinical_labeled_continuous)
anova(mod2)
mod3 <- coxph(SurvObj ~ Cytotoxicity + (Subgroup == "FBI-AMP High") + age_at_initial_pathologic_diagnosis,
subset(tcga_clinical_labeled_continuous, !is.na(Subgroup)))
anova(mod3)
So the effects of immune activity and FBI/HRD status are collinear.
Final run
Instead of overwriting the previous, this is its own section since the underlying implementation has changed substantially.
variant_table <- read_variant_file(master_variant_file, db_path)
breakpoint_table <- read_variant_file(master_breakpoint_file, db_path)
sig_results <- read_signature_files(mmctm_final_patient_dir, variant_table,
breakpoint_table, db_path)
sig_results_snv <- subset(sig_results$snv, is_present == 1)
sig_results_sv <- subset(sig_results$sv, is_present == 1)
sig_results_nonith <- sig_results$nonith %>% as.data.frame %>% column_to_rownames("signature") %>%
t %>% as.data.frame %>% rownames_to_column("condensed_id")
signature_labels <- str_extract(c(colnames(sig_results_snv), colnames(sig_results_sv)),
"SN?V-[0-9]+")
signature_labels <- signature_labels[!is.na(signature_labels)]
summarize_signature_proportions <- function(x, by = c("condensed_id", "patient_id"),
signature_labels, report_count = FALSE) {
props <- subset(x, select = c(by, colnames(x)[colnames(x) %in% signature_labels])) %>%
group_by_(.dots = by) %>% summarise_each(funs(sum))
if (report_count) {
counts <- subset(x, select = c(by, colnames(x)[colnames(x) %in% signature_labels])) %>%
group_by_(.dots = by) %>% summarise(n = n())
props <- plyr::join(props %>% as.data.frame, counts %>% as.data.frame)
}
sums <- rowSums(subset(props, select = colnames(props[colnames(props) %in%
signature_labels])))
sigs <- intersect(signature_labels, colnames(props))
for (sig in sigs) {
props[, sig] <- props[, sig]/sums
}
return(props)
}
Sample signature proportions
These proportions won’t be exactly equal to the topic proportions that the model outputs (they are variational estimates), but we’re actually doing pretty darn well.
TODO: Make a QC plot. From a glance it seems that proportion error is usually within 1-5% relative error.
sample_props_snv <- summarize_signature_proportions(sig_results_snv, by = c("condensed_id",
"patient_id"), signature_labels)
sample_props_sv <- summarize_signature_proportions(sig_results_sv, by = c("condensed_id",
"patient_id"), signature_labels)
patient_props_snv <- sig_results_snv %>% subset(select = c("patient_id", "chrom",
"coord", "ref", "alt", intersect(signature_labels, colnames(sig_results_snv)))) %>%
unique %>% summarize_signature_proportions(by = c("patient_id"), signature_labels)
patient_props_sv <- sig_results_sv %>% subset(select = c("patient_id", "prediction_id",
intersect(signature_labels, colnames(sig_results_sv)))) %>% unique %>% summarize_signature_proportions(by = c("patient_id"),
signature_labels)
sample_props <- plyr::join(sample_props_snv %>% as.data.frame, sample_props_sv %>%
as.data.frame)
sample_props <- rbind.fill(sample_props, sig_results_nonith)
sample_props_mat <- sample_props %>% column_to_rownames("condensed_id") %>%
subset(select = c(colnames(sample_props)[colnames(sample_props) %in% signature_labels]))
sample_props_mat_scaled <- scale(sample_props_mat)
# [,c('SNV-2', 'SNV-5', 'SV-3', 'SV-6', 'SV-8', 'SV-1', 'SV-7', 'SV-5',
# 'SNV-3')]
sample_props_heat <- pheatmap(sample_props_mat_scaled, fontsize_row = 5, clustering_method = "ward.D2")
patient_props <- plyr::join(patient_props_snv %>% as.data.frame, patient_props_sv %>%
as.data.frame)
patient_props <- rbind.fill(patient_props, sig_results_nonith %>% plyr::rename(c(condensed_id = "patient_id")))
patient_props_mat <- patient_props %>% column_to_rownames("patient_id") %>%
subset(select = c(colnames(patient_props)[colnames(patient_props) %in% signature_labels]))
patient_props_mat_scaled <- scale(patient_props_mat)
# [,c('SNV-2', 'SNV-5', 'SV-3', 'SV-6', 'SV-8', 'SV-1', 'SV-7', 'SV-5',
# 'SNV-3')]
patient_props_heat <- pheatmap(patient_props_mat_scaled, fontsize_row = 5, clustering_method = "ward.D2",
clustering_distance_rows = "correlation")
ITH cohort
sample_clusts <- as.data.frame(cutree(sample_props_heat$tree_row, 4)) %>% setNames(c("cluster")) %>%
rownames_to_column("condensed_id")
ihc_stabilize_subset <- stabilize_ihc_variances(ihc_table_slide, ihc_table,
tiltypes)
data_matrices <- list(sample_clusts, ihc_stabilize_subset, xcr_diversity, celltype_df,
pathway_df, proportion_subclonality_subset)
data_matrices <- lapply(data_matrices, function(x) {
x %>% dplyr::select(-one_of("patient_id"))
})
combined_df <- Reduce(function(x, y) plyr::join(x, y, type = "full"), data_matrices)
combined_df <- subset(combined_df, !is.na(cluster) & !condensed_id %in% c("7_BrnM"))
combined_df <- subset(combined_df, condensed_id %in% subset(samples, project_code ==
"ITH")$condensed_id)
combined_mat <- subset(combined_df, select = -c(condensed_id, cluster))
rownames(combined_mat) <- combined_df$condensed_id
# ref_dendrogram <- prune(as.dendrogram(sample_heat$tree_row), '7_BrnM')
sample_order <- sample_props_heat$tree_row$labels[sample_props_heat$tree_row$order]
sample_order <- intersect(sample_order, rownames(combined_mat))
cluster_annotations <- subset(combined_df, select = c(condensed_id, cluster))
SIGS <- c("SV-3", "SNV-5", "SV-6", "SV-8", "SNV-2")
sig_annotations <- rownames_to_column(data.frame(sample_props_mat[, SIGS], check.names = FALSE),
var = "condensed_id")
## Take logarithms to ~variance stabilize
variant_count_annotations <- variant_sample %>% group_by_(.dots = "condensed_id") %>%
summarise(snv_count = log(sum(snv_count)), bkpt_count = log(sum(bkpt_count))) %>%
subset(select = c("condensed_id", "snv_count", "bkpt_count"))
row_annotations <- Reduce(plyr::join, list(cluster_annotations, molsubtypes,
sig_annotations, variant_count_annotations))
row_annotations <- row_annotations %>% column_to_rownames(var = "condensed_id")
row_annotations_ith <- row_annotations
combined_mat_scaled <- scale(combined_mat)
pheatmap(clip_values(combined_mat_scaled, 2, -2)[sample_order, ], cluster_rows = FALSE,
cluster_cols = TRUE, annotation_row = row_annotations, fontsize = 6)
combined_mat_scaled_ith <- combined_mat_scaled
ICGC validation
icgc_fbi_status <- fread(wang_icgc_fbi_status_file)
colnames(icgc_fbi_status) <- mapvalues(colnames(icgc_fbi_status), from = c("Case_ID"),
to = c("condensed_id"))
colnames(icgc_immune) <- mapvalues(colnames(icgc_immune), "new_id", "condensed_id")
data_matrices <- list(sample_clusts, icgc_immune)
data_matrices <- lapply(data_matrices, function(x) {
x %>% dplyr::select(-one_of("patient_id"))
})
combined_df <- Reduce(function(x, y) plyr::join(x, y, type = "inner"), data_matrices)
# combined_df <- subset(combined_df, cluster == 1)
combined_mat <- subset(combined_df, select = -c(condensed_id, cluster))
rownames(combined_mat) <- combined_df$condensed_id
sample_order <- sample_props_heat$tree_row$labels[sample_props_heat$tree_row$order]
sample_order <- intersect(sample_order, rownames(combined_mat))
cluster_annotations <- subset(combined_df, select = c(condensed_id, cluster))
SIGS <- c("SV-3", "SNV-5", "SV-6", "SV-8", "SNV-2")
sig_annotations <- rownames_to_column(data.frame(sample_props_mat[, SIGS], check.names = FALSE),
var = "condensed_id")
subtype_annotations <- subset(icgc_subtypes, select = c("icgc_donor_id", "subtype",
"nmf_subtype"))
colnames(subtype_annotations)[1] <- "condensed_id"
icgc_fbi_annotations <- subset(icgc_fbi_status, select = c("condensed_id", "Patch et al. Class",
"Patch et al. Molecular Subgroup", "BRCA.status", "Subgroup"))
row_annotations <- Reduce(plyr::join, list(cluster_annotations, sig_annotations,
subtype_annotations, icgc_fbi_annotations))
row_annotations <- row_annotations %>% column_to_rownames(var = "condensed_id")
row_annotations_noith <- row_annotations
# select_rows <- rownames(subset(row_annotations, `SV-4` < 1))
notx <- subset(specimen_data, str_detect(specimen_type, "Primary") & specimen_donor_treatment_type ==
"no treatment")$icgc_donor_id
# sample_order <- intersect(sample_order, select_rows)
sample_order <- intersect(sample_order, notx)
# order_fbi <- order(row_annotations[rownames(combined_mat_scaled),]$`SV-8`)
# combined_mat_scaled <- scale(combined_mat)
combined_mat_scaled <- as.data.frame(apply(combined_mat, 2, function(x) (x -
median(x, na.rm = TRUE))/mad(x, na.rm = TRUE)))
combined_mat_scaled <- combined_mat_scaled[sample_order, ]
other_mat <- sample_props_mat_scaled[sample_order, ]
pheatmap(clip_values(cbind(combined_mat_scaled, other_mat), 2, -2), cluster_rows = FALSE,
cluster_cols = TRUE, annotation_row = row_annotations, fontsize = 6, clustering_distance_cols = "correlation",
clustering_method = "ward.D2")
combined_mat_scaled_noith <- combined_mat_scaled
icgc_clinical_labeled <- Reduce(function(x, y) plyr::join(x, y, type = "full"),
list(setNames(cluster_annotations, c("icgc_donor_id", "cluster")), icgc_clinical,
setNames(subset(icgc_fbi_annotations, select = c("condensed_id", "BRCA.status",
"Subgroup")), c("icgc_donor_id", "BRCA.status", "Wang_subgroup"))))
icgc_clinical_labeled$SurvObj <- with(icgc_clinical_labeled, Surv(donor_survival_time,
donor_vital_status == "deceased"))
Survival by FBI status …
simple_survival_analysis(SurvObj ~ cluster != 1, data = subset(icgc_clinical_labeled,
cluster != 4))
Call:
survival::survdiff(formula = formula, data = data)
N Observed Expected (O-E)^2/E (O-E)^2/V
cluster != 1=FALSE 28 26 28.9 0.298 0.627
cluster != 1=TRUE 36 32 29.1 0.297 0.627
Chisq= 0.6 on 1 degrees of freedom, p= 0.428
simple_survival_analysis(SurvObj ~ cluster, data = subset(icgc_clinical_labeled,
cluster != 4))
Call:
survival::survdiff(formula = formula, data = data)
N Observed Expected (O-E)^2/E (O-E)^2/V
cluster=1 28 26 28.94 0.298 0.627
cluster=2 25 23 21.32 0.133 0.223
cluster=3 11 9 7.75 0.203 0.249
Chisq= 0.7 on 2 degrees of freedom, p= 0.716
simple_survival_analysis(SurvObj ~ Wang_subgroup, data = icgc_clinical_labeled)
Call:
survival::survdiff(formula = formula, data = data)
n=66, 19 observations deleted due to missingness.
N Observed Expected (O-E)^2/E (O-E)^2/V
Wang_subgroup=High FBI 36 35 28.6 1.45 2.89
Wang_subgroup=Low FBI 30 25 31.4 1.32 2.89
Chisq= 2.9 on 1 degrees of freedom, p= 0.0889
Combined
Sample-level
combined_mat_scaled_all <- rbind.fill(combined_mat_scaled_ith %>% as.data.frame %>%
rownames_to_column("condensed_id"), combined_mat_scaled_noith %>% as.data.frame %>%
rownames_to_column("condensed_id")) %>% column_to_rownames("condensed_id")
row_annotations_all <- rbind.fill(row_annotations_ith %>% rownames_to_column("condensed_id"),
row_annotations_noith %>% rownames_to_column("condensed_id")) %>% column_to_rownames("condensed_id")
sample_order <- sample_props_heat$tree_row$labels[sample_props_heat$tree_row$order]
sample_order <- intersect(sample_order, rownames(combined_mat_scaled_all))
# combined_mat_scaled <- as.data.frame(apply(combined_mat_all, 2,
# function(x) (x-median(x,na.rm=TRUE))/mad(x,na.rm=TRUE)))
combined_mat_scaled <- combined_mat_scaled_all[sample_order, ]
nanostring_vars <- rownames(icgc_celltype_matrix)
combined_mat_scaled_subset <- subset(combined_mat_scaled, select = nanostring_vars)
combined_mat_scaled_subset <- combined_mat_scaled_subset[!apply(combined_mat_scaled_subset,
1, function(x) all(is.na(x))), ]
pheatmap(clip_values(combined_mat_scaled_subset, 2, -2), cluster_rows = FALSE,
cluster_cols = TRUE, annotation_row = row_annotations_all, fontsize = 6,
clustering_distance_cols = "correlation", clustering_method = "ward.D2")
Doesn’t really cluster consistently between ICGC and our samples.
TODO: Batch correct ICGC and our data together (I think I already did this somewhere) – and normalize the final matrix in one step rather than normalizing separately for each subcohort and combining those together. Otherwise we could always be subject to the scenario where the ITH cohort may be skewed in immune response (either all relatively low or high) compared to the ICGC cohort.
TODO: Use absolute counts of mutations for each signature, and cluster based on those. It may be that samples with too low/high mutation counts are not clustering properly.
Patient-level
# ith_icgc_batch_corrected_expression_file <-
# '~/shahlab/projects/ITH_Immune/paper/results/tables/run2/ith_icgc_merged_bc.tsv'
ith_icgc_expression <- fread(ith_icgc_batch_corrected_expression_file)
ith_icgc_celltype_expr <- create_celltype_matrix(ith_icgc_expression, labels,
db_path, convert_ids = FALSE)
ith_icgc_pathway_expr <- create_pathway_matrix(ith_icgc_expression, labels,
db_path, convert_ids = FALSE)
icgc_specimen_data <- fread(icgc_specimen_file)
summarize_expression_by_patient <- function(expr) {
primary_specimen_dat <- subset(icgc_specimen_data, str_detect(specimen_type,
"Primary"))
notx <- subset(primary_specimen_dat, str_detect(specimen_type, "Primary") &
specimen_donor_treatment_type == "no treatment")$icgc_donor_id
primary_specimen_dat <- subset(primary_specimen_dat, icgc_donor_id %in%
notx)
x <- setNames(melt(as.matrix(expr)), c("Name", "sample", "expr"))
x$patient_id <- map_id(as.character(x$sample), from = "condensed_id", to = "patient_id",
db_path)
idx <- is.na(x$patient_id)
donor_labels <- df_as_map(primary_specimen_dat, as.character(x$sample[idx]),
from = "icgc_specimen_id", to = "icgc_donor_id")
x$patient_id[idx] <- donor_labels
x <- subset(x, !is.na(patient_id))
x_sum <- x %>% group_by(Name, patient_id) %>% summarise(expr = mean(expr,
na.rm = TRUE))
res <- dcast(x_sum, formula = Name ~ patient_id, value.var = "expr")
return(res)
}
NCLUST <- 3
clusters <- cutree(patient_props_heat$tree_row, NCLUST)
patient_clusts <- make_cluster_frame(clusters)
cluster_annotations <- subset(plyr::rename(patient_clusts, c(new_id = "patient_id")),
select = c(patient_id, cluster))
ith_icgc_celltype_expr_patient <- summarize_expression_by_patient(ith_icgc_celltype_expr)
ith_icgc_pathway_expr_patient <- summarize_expression_by_patient(ith_icgc_pathway_expr)
expr_dat <- ith_icgc_pathway_expr_patient
combined_patient_mat <- expr_dat %>% as.data.frame %>% column_to_rownames(var = "Name") %>%
t %>% as.data.frame %>% rownames_to_column("patient_id")
data_matrices <- list(cluster_annotations, combined_patient_mat)
data_matrices <- lapply(data_matrices, function(x) {
x
})
combined_df <- Reduce(function(x, y) plyr::join(x, y, type = "inner"), data_matrices)
combined_mat <- subset(combined_df, select = -c(cluster))
rownames(combined_mat) <- NULL
combined_mat <- combined_mat %>% column_to_rownames("patient_id")
SIGS <- c("SV-3", "SNV-5", "SV-6", "SV-8", "SNV-2")
sig_annotations <- rownames_to_column(data.frame(patient_props_mat[, SIGS],
check.names = FALSE), var = "patient_id")
subtype_annotations <- subset(icgc_subtypes, select = c("icgc_donor_id", "subtype",
"nmf_subtype"))
colnames(subtype_annotations)[1] <- "patient_id"
icgc_fbi_status_patient <- fread(wang_icgc_fbi_status_file)
colnames(icgc_fbi_status_patient) <- mapvalues(colnames(icgc_fbi_status_patient),
from = c("Case_ID"), to = c("patient_id"))
icgc_fbi_annotations <- subset(icgc_fbi_status_patient, select = c("patient_id",
"Patch et al. Class", "Patch et al. Molecular Subgroup", "BRCA.status",
"Subgroup"))
row_annotations <- Reduce(plyr::join, list(cluster_annotations, sig_annotations,
subtype_annotations, icgc_fbi_annotations))
row_annotations <- row_annotations %>% column_to_rownames("patient_id")
combined_patient_mat_scaled <- combined_mat %>% scale
patient_order <- patient_props_heat$tree_row$labels[patient_props_heat$tree_row$order]
patient_order <- intersect(patient_order, rownames(combined_patient_mat_scaled))
pheatmap(clip_values(combined_patient_mat_scaled[patient_order, ], 2, -2), cluster_rows = FALSE,
cluster_cols = TRUE, annotation_row = row_annotations, fontsize = 6, clustering_distance_cols = "correlation",
clustering_method = "ward.D2")
Ancestral-descendant signature proportions
ancdesc_props_snv <- unique(subset(sig_results_snv, select = c("patient_id",
"is_ancestral", "chrom", "coord", "ref", "alt", signature_labels[signature_labels %in%
colnames(sig_results_snv)]))) %>% summarize_signature_proportions(by = c("patient_id",
"is_ancestral"), signature_labels)
ancdesc_props_sv <- unique(subset(sig_results_sv, select = c("patient_id", "prediction_id",
"is_ancestral", signature_labels[signature_labels %in% colnames(sig_results_sv)]))) %>%
summarize_signature_proportions(by = c("patient_id", "is_ancestral"), signature_labels)
ancdesc_props <- plyr::join(ancdesc_props_snv %>% as.data.frame, ancdesc_props_sv %>%
as.data.frame)
ancdesc_props_mat <- subset(ancdesc_props, select = colnames(ancdesc_props)[colnames(ancdesc_props) %in%
signature_labels])
ancdesc_props_scaled <- scale(ancdesc_props_mat)
rownames(ancdesc_props_scaled) <- with(ancdesc_props, paste(patient_id, is_ancestral,
sep = "_"))
pheatmap(ancdesc_props_scaled, clustering_distance_rows = "euclidean", clustering_method = "ward.D2")
ancdesc_props_melted <- melt(ancdesc_props, id.vars = c("patient_id", "is_ancestral"),
measure.vars = intersect(colnames(ancdesc_props), signature_labels), variable.name = "signature",
value.name = "proportion")
pvals <- setNames(ddply(ancdesc_props_melted, .(signature), function(x) {
df <- as.data.frame(x)
corres <- wilcox.test(proportion ~ is_ancestral, df, paired = TRUE)
pval <- corres$p.value
eq <- substitute(italic(P) == p, list(p = format(pval, digits = 3)))
return(as.character(as.expression(eq)))
}), c("signature", "p.value"))
ggplot(ancdesc_props_melted, aes(x = factor(is_ancestral), y = proportion)) +
geom_boxplot() + facet_wrap(~signature, scales = "free") + geom_text(data = pvals,
aes(x = Inf, y = Inf, label = p.value), hjust = 1.1, vjust = 1.5, size = 3,
parse = TRUE) + theme_bw() + theme_Publication()
Origin node signature proportions
Note: We cannot get node-specific rearrangement signatures.
Note: These node labels DO NOT correspond to node labels within the clone phylogeny; they correspond to nodes within the Dollo model.
node_props_snv <- unique(subset(sig_results_snv, select = c("patient_id", "origin_node",
"chrom", "coord", "ref", "alt", signature_labels[signature_labels %in% colnames(sig_results_snv)]))) %>%
summarize_signature_proportions(by = c("patient_id", "origin_node"), signature_labels,
report_count = TRUE)
node_props_mat <- subset(node_props_snv, select = colnames(node_props_snv)[colnames(node_props_snv) %in%
signature_labels])
node_props_scaled <- scale(node_props_mat)
rownames(node_props_scaled) <- with(node_props_snv, paste(patient_id, origin_node,
sep = "_"))
row_annotations <- subset(node_props_snv, select = c("patient_id", "n"))
row_annotations$n <- log2(row_annotations$n)
row_annotations$patient_id <- factor_id(row_annotations$patient_id, type = "patient_id",
db_path)
rownames(row_annotations) <- rownames(node_props_scaled)
clustering_colours <- list(patient_id = pal_patient)
pheatmap(clip_values(node_props_scaled, 2, -2), clustering_distance_rows = "euclidean",
clustering_method = "ward.D2", fontsize_row = 5, annotation_row = row_annotations,
annotation_colors = clustering_colours)
Looks like there are similar selection pressures acting at each part of the sample phylogeny – i.e. mutation signatures are ‘relatively’ consistent within patients throughout time.
Clonal phylogeny branch-specific signature proportions
NOTE: THE LABELS ON THESE TREES MAY NOT BE THE SAME AS THOSE IN THE MAPSCAPES (ugh).
tree_branch_data <- read_clone_tree_data(clone_tree_file, clone_branch_length_file,
clone_prevalence_file, db_path)
trees <- lapply(tree_branch_data, function(x) x$tree)
branch_lengths <- rbind.fill(lapply(tree_branch_data, function(x) x$branch_dat))
snv_cluster <- rbind.fill(lapply(seq_along(snv_cluster_files), function(i) {
f <- snv_cluster_files[[i]]
patient_id <- snv_cluster_patients[[i]]
snv_cluster <- read_snv_cluster(f, clone_branch_length_file, db_path)
return(snv_cluster)
}))
snv_cluster <- plyr::join(snv_cluster, branch_lengths)
snv_cluster_filtered <- subset(snv_cluster, !is.na(label))
sig_results_snv_cluster <- plyr::join(sig_results_snv, snv_cluster_filtered)
branch_props_snv <- unique(subset(sig_results_snv_cluster, select = c("patient_id",
"label", "chrom", "coord", "ref", "alt", signature_labels[signature_labels %in%
colnames(sig_results_snv_cluster)]))) %>% summarize_signature_proportions(by = c("patient_id",
"label"), signature_labels, report_count = TRUE)
branch_props_snv_melted <- melt(branch_props_snv, id.vars = c("patient_id",
"label", "n"), measure.vars = intersect(signature_labels, colnames(branch_props_snv)),
variable.name = "signature", value.name = "proportion")
ggplot(branch_props_snv_melted, aes(x = label, y = proportion)) + geom_bar(aes(fill = signature),
stat = "identity") + facet_wrap(~patient_id, ncol = 1, scales = "free_x") +
theme_bw() + theme_Publication() + geom_text(data = unique(subset(branch_props_snv_melted,
select = c("patient_id", "label", "n"))), aes(x = label, y = 1, label = n),
vjust = -0.2, stat = "identity") + ylim(c(0, 1.2))
What if we use MAP assignments? (This would allow us to apply chi-square tests.)
TODO: Figure out how to apply tests between k > 2 groups of proportions… in other words, a test of homogeneity.
id_vars <- colnames(sig_results_snv_cluster)[!colnames(sig_results_snv_cluster) %in%
signature_labels]
sig_results_snv_cluster_melted <- melt(sig_results_snv_cluster, id.vars = id_vars,
measure.vars = intersect(signature_labels, colnames(sig_results_snv_cluster)),
variable.name = "signature", value.name = "proportion")
maxes <- sig_results_snv_cluster_melted %>% group_by_(.dots = id_vars) %>% summarise(maxprop = max(proportion))
sig_results_snv_cluster_melted <- plyr::join(sig_results_snv_cluster_melted,
maxes)
sig_results_snv_cluster_melted$proportion_map <- ifelse(sig_results_snv_cluster_melted$proportion ==
sig_results_snv_cluster_melted$maxprop, 1, 0)
sig_results_snv_cluster_casted <- dcast(sig_results_snv_cluster_melted, formula = paste0(paste(id_vars,
collapse = "+"), "~ signature"), value.var = "proportion_map")
branch_props_snv_map <- unique(subset(sig_results_snv_cluster_casted, select = c("patient_id",
"label", "chrom", "coord", "ref", "alt", signature_labels[signature_labels %in%
colnames(sig_results_snv_cluster_casted)]))) %>% summarize_signature_proportions(by = c("patient_id",
"label"), signature_labels, report_count = TRUE)
branch_props_snv_map_melted <- melt(branch_props_snv_map, id.vars = c("patient_id",
"label", "n"), measure.vars = intersect(signature_labels, colnames(branch_props_snv_map)),
variable.name = "signature", value.name = "proportion")
ggplot(branch_props_snv_map_melted, aes(x = label, y = proportion)) + geom_bar(aes(fill = signature),
stat = "identity") + facet_wrap(~patient_id, ncol = 1, scales = "free_x") +
theme_bw() + theme_Publication() + geom_text(data = unique(subset(branch_props_snv_map_melted,
select = c("patient_id", "label", "n"))), aes(x = label, y = 1, label = n),
vjust = -0.2, stat = "identity") + ylim(c(0, 1.2))
Adjusted clone trees
trees_age <- trees
age_signature <- "SNV-5"
branch_props_dat <- branch_props_snv_map_melted
tree_objects <- lapply(seq_along(trees_age), function(i) {
tree <- trees_age[[i]]
tree_old <- tree
patient <- names(trees_age)[i]
branch_props_dat_sub <- subset(branch_props_dat, patient_id == as.numeric(patient) &
signature == age_signature)
branch_props_dat_sub$length <- with(branch_props_dat_sub, proportion * n)
edge_lengths <- tree@edge.length
idx <- which(edge_lengths == 0)
to_labels <- tree@label[str_extract(names(edge_lengths), "[0-9]+$")]
lengths <- df_as_map(branch_props_dat_sub, unname(to_labels), from = "label",
to = "length")
if (length(idx) > 0) {
lengths[idx] <- 0
}
tree@edge.length <- lengths
names(tree@edge.length) <- names(edge_lengths)
return(list(all = tree_old, age = tree))
})
names(tree_objects) <- names(trees_age)
tmp <- unlist(tree_objects)
# ignore <- lapply(seq_along(tmp), function(i) { print(plot(tmp[[i]],
# show.node.label = TRUE, main = names(tmp)[i])) })
find_ancestors <- function(tree) {
x <- phylobase:::.phylo4ToDataFrame(tree)
root <- subset(x, node.type == "root")$label
root_number <- names(which(tree@label == root))
direct_descendants <- subset(x, ancestor == as.numeric(root_number))$label
if (length(direct_descendants) == 1) {
return(c(root, direct_descendants))
} else {
return(root)
}
}
patients <- unique(branch_props_dat$patient_id)
root_data <- rbind.fill(lapply(patients, function(pat) {
tree <- trees[[as.character(pat)]]
ancestors <- find_ancestors(tree)
rbind.fill(lapply(ancestors, function(x) {
data.frame(label = x, patient_id = pat)
}))
}))
root_data$node_type <- "root"
branch_data_annotated <- plyr::join(branch_props_dat, root_data)
branch_data_annotated$node_type[is.na(branch_data_annotated$node_type)] <- "descendant"
root_proportions <- branch_data_annotated %>% subset(node_type == "root") %>%
group_by(patient_id, signature) %>% summarise(proportion = weighted.mean(proportion,
w = n)) %>% plyr::rename(c(proportion = "root_proportion"))
branch_data_annotated <- plyr::join(branch_data_annotated, root_proportions)
branch_data_annotated$proportion_diff <- with(branch_data_annotated, proportion -
root_proportion)
ggplot(branch_data_annotated %>% subset(node_type == "descendant" & n > 40),
aes(x = proportion_diff, fill = signature)) + geom_histogram(binwidth = 0.02,
alpha = 0.4, position = "identity") + theme_bw() + theme_Publication()
---
title: "Mutation signatures"
output: 
  html_notebook:
    toc: true
    toc_depth: 5
    toc_float: true
params:
  rmd: "mutation_signatures.Rmd"
---
                        ```{r, echo=FALSE, message=FALSE, warning=FALSE}

######## Snakemake header ########
library(methods)
Snakemake <- setClass(
    "Snakemake",
    slots = c(
        input = "list",
        output = "list",
        params = "list",
        wildcards = "list",
        threads = "numeric",
        log = "list",
        resources = "list",
        config = "list",
        rule = "character"
    )
)
snakemake <- Snakemake(
    input = list('/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/tree_data.tsv', '/shahlab/alzhang/data/TCGA/synapse_clinAll_data.tsv', '/shahlab/amcpherson/projects/ith3/ith3/notebooks/bespoke/ith_breakpoints.tsv', '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/ith_by-ancestry-sample/output', '/shahlab/alzhang/projects/ITH_Immune/data/expression/nanostring/pancancer_annotations.tsv', '/shahlab/amcpherson/projects/ith3/ith3/notebooks/bespoke/ith_snvs.tsv', '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/ith_by-patient_with-ov', '/shahlab/alzhang/projects/ITH_Immune/paper/results/web/resources/mmctm_final_patient_sigplot.png', '/shahlab/alzhang/data/ICGC/OVAU_expr_melted.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/web/resources/mmctm_sample_sigplot.png', '/shahlab/alzhang/data/ICGC/icgc_primary_tumour_subtypes.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/web/resources/mmctm_ov_combined_sigplot.png', '/shahlab/alzhang/projects/ITH_Immune/paper/results/web/resources/mmctm_sample_ad_sigplot.png', '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/combined_ov_mmctm/output', '/shahlab/alzhang/projects/ITH_Immune/data/ith/complete/old_proportion_subclonal.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_1.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_2.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_3.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_4.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_7.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_9.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_10.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_11.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_12.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_13.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_14.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_15.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_16.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_17.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/ith_icgc_merged_bc.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/molsubtypes.tsv', '/shahlab/alzhang/pipeline_outputs/ith_immune/mixcr/mixcr_runs/ith_1_2_3/mixcr5/postprocess/TRB/postfilter_diversity_stats/diversity.strict.resampled.txt', 'Rmd/mutation_signatures.Rmd', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/ihc_table_slide.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/clone_data.tsv', '/shahlab/alzhang/projects/ITH_Immune/results/nanostring_results/ith_full/qc/limma_quantile/normalized_expression_voa_labels_filtered.tsv', '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/ith_by-sample/output', '/shahlab/alzhang/data/ICGC/specimen.tsv', '/shahlab/alzhang/pipeline_outputs/ith_immune/mixcr/mixcr_runs/ith_1_2_3/mixcr5/postprocess/IGH/postfilter_diversity_stats/diversity.strict.resampled.txt', '/shahlab/alzhang/projects/ITH_Immune/paper/analysis/Rmd/_site.yml', '/shahlab/alzhang/data/ICGC/OVAU_expr_matrix.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/web/resources/mmctm_patient_ad_sigplot.png', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/branch_data.tsv', '/shahlab/alzhang/data/ICGC/ng.3849-S12.txt', '/shahlab/alzhang/data/TCGA/tcga_ov_annotation_sup13.txt', '/shahlab/alzhang/data/ICGC/donor.OV-AU.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/ihc_table.tsv', '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/ith_by-patient-ancestry/output', '/shahlab/alzhang/data/TCGA/expr_matrix_normalize_standardize_noduplicates.tsv', "tcga_clinical" = '/shahlab/alzhang/data/TCGA/synapse_clinAll_data.tsv', "master_breakpoint_file" = '/shahlab/amcpherson/projects/ith3/ith3/notebooks/bespoke/ith_breakpoints.tsv', "mmctm_patient_ad_dir" = '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/ith_by-patient-ancestry/output', "mmctm_sample_ad_dir" = '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/ith_by-ancestry-sample/output', "nanostring_annotations" = '/shahlab/alzhang/projects/ITH_Immune/data/expression/nanostring/pancancer_annotations.tsv', "master_variant_file" = '/shahlab/amcpherson/projects/ith3/ith3/notebooks/bespoke/ith_snvs.tsv', "icgc_expr_melted" = '/shahlab/alzhang/data/ICGC/OVAU_expr_melted.tsv', "mmctm_final_patient_sigplot" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/web/resources/mmctm_final_patient_sigplot.png', "clone_tree_file" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/tree_data.tsv', "mmctm_sample_sigplot" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/web/resources/mmctm_sample_sigplot.png', "icgc_subtypes" = '/shahlab/alzhang/data/ICGC/icgc_primary_tumour_subtypes.tsv', "mmctm_ov_combined_sigplot" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/web/resources/mmctm_ov_combined_sigplot.png', "mmctm_sample_ad_sigplot" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/web/resources/mmctm_sample_ad_sigplot.png', "mmctm_ov_combined_dir" = '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/combined_ov_mmctm/output', "subclonality" = '/shahlab/alzhang/projects/ITH_Immune/data/ith/complete/old_proportion_subclonal.tsv', "snv_cluster_files" = c('/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_1.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_2.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_3.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_4.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_7.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_9.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_10.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_11.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_12.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_13.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_14.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_15.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_16.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_17.tsv'), "ith_icgc_bc" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/ith_icgc_merged_bc.tsv', "molecular_subtypes" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/molsubtypes.tsv', "tcr_diversity" = '/shahlab/alzhang/pipeline_outputs/ith_immune/mixcr/mixcr_runs/ith_1_2_3/mixcr5/postprocess/TRB/postfilter_diversity_stats/diversity.strict.resampled.txt', "notebook" = 'Rmd/mutation_signatures.Rmd', "ihc_table_slide" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/ihc_table_slide.tsv', "clone_prevalence_file" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/clone_data.tsv', "nanostring_data" = '/shahlab/alzhang/projects/ITH_Immune/results/nanostring_results/ith_full/qc/limma_quantile/normalized_expression_voa_labels_filtered.tsv', "mmctm_sample_dir" = '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/ith_by-sample/output', "icgc_specimen" = '/shahlab/alzhang/data/ICGC/specimen.tsv', "bcr_diversity" = '/shahlab/alzhang/pipeline_outputs/ith_immune/mixcr/mixcr_runs/ith_1_2_3/mixcr5/postprocess/IGH/postfilter_diversity_stats/diversity.strict.resampled.txt', "site_file" = '/shahlab/alzhang/projects/ITH_Immune/paper/analysis/Rmd/_site.yml', "icgc_expr_mat" = '/shahlab/alzhang/data/ICGC/OVAU_expr_matrix.tsv', "mmctm_patient_ad_sigplot" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/web/resources/mmctm_patient_ad_sigplot.png', "clone_branch_length_file" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/branch_data.tsv', "wang_fbi_status" = '/shahlab/alzhang/data/ICGC/ng.3849-S12.txt', "tcga_ov_annotation" = '/shahlab/alzhang/data/TCGA/tcga_ov_annotation_sup13.txt', "icgc_clinical" = '/shahlab/alzhang/data/ICGC/donor.OV-AU.tsv', "ihc_table" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/ihc_table.tsv', "mmctm_final_patient_dir" = '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/ith_by-patient_with-ov', "tcga_expr_mat" = '/shahlab/alzhang/data/TCGA/expr_matrix_normalize_standardize_noduplicates.tsv'),
    output = list('/shahlab/alzhang/projects/ITH_Immune/paper/results/web/mutation_signatures.nb.html'),
    params = list(c('E_CD8_density', 'E_CD4_density', 'E_CD20_density', 'E_Plasma_density', 'S_CD8_density', 'S_CD4_density', 'S_CD20_density', 'S_Plasma_density', 'T_CD8_density', 'T_CD4_density', 'T_CD20_density', 'T_Plasma_density'), 'ithi-analysis-mutation-signature-notebook', '/shahlab/alzhang/projects/ITH_Immune/metadata/db/immune_project.sqlite3', c('1', '2', '3', '4', '7', '9', '10', '11', '12', '13', '14', '15', '16', '17'), "mutsig_tiltypes" = c('E_CD8_density', 'E_CD4_density', 'E_CD20_density', 'E_Plasma_density', 'S_CD8_density', 'S_CD4_density', 'S_CD20_density', 'S_Plasma_density', 'T_CD8_density', 'T_CD4_density', 'T_CD20_density', 'T_Plasma_density'), "name" = 'ithi-analysis-mutation-signature-notebook', "db" = '/shahlab/alzhang/projects/ITH_Immune/metadata/db/immune_project.sqlite3', "snv_cluster_patients" = c('1', '2', '3', '4', '7', '9', '10', '11', '12', '13', '14', '15', '16', '17')),
    wildcards = list(),
    threads = 1,
    log = list('/shahlab/alzhang/clusttmp/paperanalysis2/mutation_signatures.log'),
    resources = list(),
    config = list("notebook_dir" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/web', "nclusts" = 3, "table_dir" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2', "mmctm_ov_combined_sigplot" = '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/combined_ov_mmctm/plots/ov_snv-sv_sigs_multipanel.pdf', "mvclust_nclust" = 3, "mutation_signature_notebook" = 'Rmd/mutation_signatures.Rmd', "example_msa_plot" = '/shahlab/alzhang/pipeline_outputs/ith_immune/igpartition/run13/old/alignment_plots/msa/ith2_2/clust9/indel_reversed.html', "default_sampler" = 'HMC', "mmctm_patient_ancestral_descendant_result_dir" = '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/ith_by-patient-ancestry/output', "bcr_clonotypes" = '/shahlab/alzhang/pipeline_outputs/ith_immune/mixcr/mixcr_runs/ith_1_2_3/mixcr5/clonotypes/IGH_clonotypes_filtered.txt', "master_variant_file" = '/shahlab/amcpherson/projects/ith3/ith3/notebooks/bespoke/ith_snvs.tsv', "icgc_expr_melted" = '/shahlab/alzhang/data/ICGC/OVAU_expr_melted.tsv', "spatial_notebook" = 'Rmd/spatial_analysis.Rmd', "tcga_clinical" = '/shahlab/alzhang/data/TCGA/synapse_clinAll_data.tsv', "ith_stat_types" = c('entropy', 'postprocessed_divergence', 'combined_ith_normalized', 'proportion_subclonal'), "tils_for_cluster" = c('E_CD8_density', 'E_CD4_density', 'E_CD20_density', 'E_Plasma_density', 'S_CD8_density', 'S_CD4_density', 'S_CD20_density', 'S_Plasma_density'), "ith_statistics_notebook" = 'Rmd/ith_statistics.Rmd', "prevalence_threshold" = 0.01, "til_classifier_notebook" = 'Rmd/til_classifier.Rmd', "example_annotations" = '/shahlab/alzhang/pipeline_outputs/ith_immune/igpartition/run13/final_partitions/ith2_2/clust9/annotations_flagged.tsv', "variability_type" = 'stabilize', "immune_variability_notebook" = 'Rmd/immune_variability.Rmd', "immtyper_models" = '/shahlab/alzhang/projects/ITH_Immune/results/immtyper_results/klarenbeek/aa_vj/gradboost', "benchmarkdir" = '/shahlab/alzhang/benchmarks/paperanalysis2', "nanostring_data" = '/shahlab/alzhang/projects/ITH_Immune/results/nanostring_results/ith_full/qc/limma_quantile/normalized_expression_voa_labels_filtered.tsv', "xcr_mapping_notebook" = 'Rmd/xcr_mapping.Rmd', "ihc_run1" = '/shahlab/alzhang/projects/ITH_Immune/data/ihc/cd8cd3cd20/validated_stats_weighted_new.rdata', "phenotype_threshold" = 0.85, "neoediting_outdir" = '/shahlab/alzhang/pipeline_outputs/ith_immune/neoediting/run4', "tcga_ov_annotations" = '/shahlab/alzhang/data/TCGA/tcga_ov_annotation_sup13.txt', "logdir" = '/shahlab/alzhang/clusttmp/paperanalysis2', "xcr_distance_method" = 'horn', "db" = '/shahlab/alzhang/projects/ITH_Immune/metadata/db/immune_project.sqlite3', "icgc_molecular_subtypes" = '/shahlab/alzhang/data/ICGC/icgc_primary_tumour_subtypes.tsv', "nanostring_signature_notebook" = 'Rmd/nanostring_signatures.Rmd', "intermediate_dir" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/intermediates/run2', "PNG_DENSITY" = 300, "bcrphylo_correlations_notebook" = 'Rmd/bcr_phylo_correlations.Rmd', "mmctm_final_patient_dir" = '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/ith_by-patient_with-ov', "mmctm_sample_sigplot" = '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/ith_by-sample/plots/ith-by-sample_snv-sv_sigs_multipanel.pdf', "ith_til_notebook" = 'Rmd/ith_til_densities.Rmd', "immtyper_lengths" = '11 12 13 14 15 16 17 18', "xcr_clones_notebook" = 'Rmd/xcr_clones_analysis.Rmd', "neoantigen_editing_notebook" = 'Rmd/immunoediting.Rmd', "library_sizes" = '/shahlab/alzhang/pipeline_outputs/ith_immune/mixcr/mixcr_runs/ith_1_2_3/mixcr5/library_sizes.tsv', "igpartition_outdir" = '/shahlab/alzhang/pipeline_outputs/ith_immune/igpartition/run22', "driver_analysis_notebook" = 'Rmd/driver_analysis.Rmd', "tcr_diversity" = '/shahlab/alzhang/pipeline_outputs/ith_immune/mixcr/mixcr_runs/ith_1_2_3/mixcr5/postprocess/TRB/postfilter_diversity_stats/diversity.strict.resampled.txt', "v_dictionary" = '/shahlab/alzhang/projects/ITH_Immune/subprojects/immtyper/metadata/imgt/Homo_sapiens_TRBV.fasta', "clone_prevalence_file" = '/shahlab/alzhang/projects/ITH_Immune/data/ith/complete/clone_data.tsv', "ihc_xcr_stats_notebook" = 'Rmd/ihc_xcr_stats.Rmd', "mmctm_sample_result_dir" = '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/ith_by-sample/output', "mmctm_ancestral_descendant_result_dir" = '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/ith_by-ancestry-sample/output', "multiviewclustering_notebook" = 'Rmd/multiviewclustering.Rmd', "clonal_samplers" = c('HMC', 'NUTS'), "ith_project_results" = '/shahlab/alzhang/projects/ith3/data/results', "master_breakpoint_file" = '/shahlab/amcpherson/projects/ith3/ith3/notebooks/bespoke/ith_breakpoints.tsv', "bcrphylo_tiltypes" = c('E_CD8_density', 'E_CD4_density', 'E_CD20_density', 'E_Plasma_density', 'S_CD8_density', 'S_CD4_density', 'S_CD20_density', 'S_Plasma_density', 'T_CD8_density', 'T_CD4_density', 'T_CD20_density', 'T_Plasma_density'), "bcr_diversity" = '/shahlab/alzhang/pipeline_outputs/ith_immune/mixcr/mixcr_runs/ith_1_2_3/mixcr5/postprocess/IGH/postfilter_diversity_stats/diversity.strict.resampled.txt', "PNG_QUALITY" = 300, "clone_tree_file" = '/shahlab/alzhang/projects/ITH_Immune/data/ith/complete/tree_data.tsv', "mmctm_final_patient_sigplot" = '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/ith_by-patient_with-ov/plots/ith-by-patient_snv-sv_sigs_multipanel.pdf', "site_file" = '/shahlab/alzhang/projects/ITH_Immune/paper/analysis/Rmd/_site.yml', "patients_for_clonal" = c(1, 2, 3, 4, 7, 9, 10, 11, 12, 13, 14, 15, 16, 17), "mmctm_sample_ad_sigplot" = '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/ith_by-ancestry-sample/plots/ith-by-ancestral-sample_snv-sv_sigs_multipanel.pdf', "sad_notebook" = 'Rmd/species_abundance_distributions.Rmd', "xcr_qc_notebook" = 'Rmd/replicates.Rmd', "mvclust_tiltypes" = c('E_CD8_density', 'E_CD4_density', 'E_CD20_density', 'E_Plasma_density', 'S_CD8_density', 'S_CD4_density', 'S_CD20_density', 'S_Plasma_density'), "mutsig_tiltypes" = c('E_CD8_density', 'E_CD4_density', 'E_CD20_density', 'E_Plasma_density', 'S_CD8_density', 'S_CD4_density', 'S_CD20_density', 'S_Plasma_density', 'T_CD8_density', 'T_CD4_density', 'T_CD20_density', 'T_Plasma_density'), "icgc_normalized_reads_matrix" = '/shahlab/alzhang/data/ICGC/OVAU_expr_matrix.tsv', "xcrmapscape_notebook" = 'Rmd/xcrmapscape.Rmd', "known_subtype_file" = '/shahlab/alzhang/projects/ITH_Immune/data/expression/array/subtypes/known_subtypes.tsv', "tcr_clonotypes" = '/shahlab/alzhang/pipeline_outputs/ith_immune/mixcr/mixcr_runs/ith_1_2_3/mixcr5/clonotypes/TRB_clonotypes_filtered.txt', "classifier_type" = 'knn', "tcga_expr_matrix" = '/shahlab/alzhang/data/TCGA/expr_matrix_normalize_standardize_noduplicates.tsv', "figure_gallery_notebook" = 'Rmd/figures.Rmd', "clone_branch_length_file" = '/shahlab/alzhang/projects/ITH_Immune/data/ith/complete/branch_data.tsv', "wang_fbi_status" = '/shahlab/alzhang/data/ICGC/ng.3849-S12.txt', "icgc_clinical" = '/shahlab/alzhang/data/ICGC/donor.OV-AU.tsv', "ihc_run2" = '/shahlab/alzhang/projects/ITH_Immune/data/ihc/cd79cd138cd68/validated_stats_weighted.rdata', "nanostring_annotations" = '/shahlab/alzhang/projects/ITH_Immune/data/expression/nanostring/pancancer_annotations.tsv', "icgc_specimen_file" = '/shahlab/alzhang/data/ICGC/specimen.tsv', "proportion_subclonal_file" = '/shahlab/alzhang/projects/ITH_Immune/data/ith/complete/old_proportion_subclonal.tsv', "molsubtype_notebook" = 'Rmd/molecular_subtypes.Rmd', "driver_map" = '/shahlab/alzhang/projects/ITH_Immune/subprojects/drivers/data/gene_list_mapped.bed', "j_dictionary" = '/shahlab/alzhang/projects/ITH_Immune/subprojects/immtyper/metadata/imgt/Homo_sapiens_TRBJ.fasta', "ihc_xcr_tiltypes" = c('E_CD8_density', 'E_CD4_density', 'E_CD20_density', 'E_Plasma_density', 'S_CD8_density', 'S_CD4_density', 'S_CD20_density', 'S_Plasma_density', 'T_CD8_density', 'T_CD4_density', 'T_CD20_density', 'T_Plasma_density'), "subtype_marker_file" = '/shahlab/alzhang/projects/ITH_Immune/data/expression/nanostring/subtype_markers.tsv', "mmctm_ov_combined_result_dir" = '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/combined_ov_mmctm/output', "ith_stats_file" = '/shahlab/alzhang/projects/ITH_Immune/data/ith/complete/clonal_measures.tsv', "molsubtype_tiltypes" = c('E_CD8_density', 'E_CD4_density', 'E_CD20_density', 'E_Plasma_density', 'S_CD8_density', 'S_CD4_density', 'S_CD20_density', 'S_Plasma_density'), "tils_for_variability" = c('E_CD8_density', 'E_CD4_density', 'E_CD20_density', 'E_Plasma_density', 'S_CD8_density', 'S_CD4_density', 'S_CD20_density', 'S_Plasma_density'), "spatial_result_dirs" = list("epithelial" = '/shahlab/alzhang/pipeline_outputs/ith_immune/spatsim/ith3/abc', "stromal" = '/shahlab/alzhang/pipeline_outputs/ith_immune/spatsim/ith5/abc'), "known_subtypes_merged" = '/shahlab/alzhang/projects/ITH_Immune/data/expression/known_subtypes_merged.tsv', "mmctm_patient_ad_sigplot" = '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/ith_by-patient-ancestry/plots/ith-by-patient-ancestry_snv-sv_sigs_multipanel.pdf', "index_notebook" = 'Rmd/index.Rmd', "mapscape_notebook" = 'Rmd/mapscape.Rmd', "bcrphylo_examples_notebook" = 'Rmd/bcr_phylo_examples.Rmd'),
    rule = 'mutation_signature_notebook'
)
######## Original script #########

                        ```


```{r global_chunk_options, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, tidy=TRUE, warning=FALSE, message=FALSE)
```


```{r}
library(ithi.utils)
load_base_libs()
library(survival)
library(rms)
library(edgeR)
library(limma)
library(gage)
library(pathview)
library(DT)
library(ape)
library(phytools)
library(phylobase)

library(ithi.meta)
library(ithi.xcr)
library(ithi.ihc)
library(ithi.seq)
library(ithi.clones)
library(ithi.expr)
library(ithi.external)
```

## Colour palettes

```{r}
pal_patient <- select_palette("patient")
```

## Parameters

```{r}
db_path <- snakemake@params$db

mmctm_sample_result_dir <- snakemake@input$mmctm_sample_dir
mmctm_sample_ad_result_dir <- snakemake@input$mmctm_sample_ad_dir
mmctm_patient_ad_result_dir <- snakemake@input$mmctm_patient_ad_dir
mmctm_ov_combined_result_dir <- snakemake@input$mmctm_ov_combined_dir
mmctm_final_patient_dir <- snakemake@input$mmctm_final_patient_dir

mmctm_sample_sigplot <- snakemake@input$mmctm_sample_sigplot
mmctm_sample_ad_sigplot <- snakemake@input$mmctm_sample_ad_sigplot
mmctm_patient_ad_sigplot <- snakemake@input$mmctm_patient_ad_sigplot
mmctm_ov_combined_sigplot <- snakemake@input$mmctm_ov_combined_sigplot
mmctm_final_patient_sigplot <- snakemake@input$mmctm_final_patient_sigplot

master_variant_file <- snakemake@input$master_variant_file
master_breakpoint_file <- snakemake@input$master_breakpoint_file

ihc_table_path <- snakemake@input$ihc_table
ihc_table_slide_path <- snakemake@input$ihc_table_slide
tiltypes <- snakemake@params$mutsig_tiltypes

tcr_diversity_file <- snakemake@input$tcr_diversity
bcr_diversity_file <- snakemake@input$bcr_diversity

nanostring_data_path <- snakemake@input$nanostring_data
nanostring_annotations_path <- snakemake@input$nanostring_annotations

molecular_subtype_file <- snakemake@input$molecular_subtypes

icgc_expr_mat_file <- snakemake@input$icgc_expr_mat
icgc_specimen_file <- snakemake@input$icgc_specimen
icgc_subtype_file <- snakemake@input$icgc_subtypes
icgc_expr_raw_file <- snakemake@input$icgc_expr_melted
icgc_donor_file <- snakemake@input$icgc_clinical

tcga_expr_mat_file <- snakemake@input$tcga_expr_mat
tcga_ov_annotation_file <- snakemake@input$tcga_ov_annotation
tcga_donor_file <- snakemake@input$tcga_clinical

proportion_subclonality_file <- snakemake@input$subclonality

wang_icgc_fbi_status_file <- snakemake@input$wang_fbi_status

snv_cluster_files <- snakemake@input$snv_cluster_files
snv_cluster_patients <- snakemake@params$snv_cluster_patients

clone_tree_file <- snakemake@input$clone_tree_file
clone_prevalence_file <- snakemake@input$clone_prevalence_file
clone_branch_length_file <- snakemake@input$clone_branch_length_file

ith_icgc_batch_corrected_expression_file <- snakemake@input$ith_icgc_bc
```

## Metadata

```{r}
db <- src_sqlite(db_path, create=FALSE)
samples <- collect(tbl(db, "samples"))
```

```{r}
read_mutsig_output <- function(dir, sample_table, external=FALSE) {
  prop_table_path <- Sys.glob(file.path(dir, "*_props.tsv"))
  sigs_path <- Sys.glob(file.path(dir, "../plots/*_multipanel.pdf"))
  
  prop_table <- fread(prop_table_path)
  
  sample_cols <- colnames(prop_table)[colnames(prop_table) != "signature"]
  ith_cols <- str_detect(sample_cols, "patient")
  
  patients <- str_extract(sample_cols[ith_cols], "(?<=patient_?)[0-9]+")
  site_ids <- str_replace_all(sample_cols[ith_cols], "patient_?[0-9]+_", "")
  
  FROM <- c("rectum_site_1", "rectum_site_2", "rectum_site_3", "rectum_site_4",
            "pelvis_site_1", "pelvis_site_2", "cul_de_sac_site_1")
  TO <- c("rectum_site_1", "rectum_site_1", "rectum_site_2", "rectum_site_2",
          "pelvis_site_1", "pelvis_site_1", "cul-de-sac_site_1")
  
  site_ids_fixed <- mapvalues(site_ids, from = FROM, to=TO)
  df <- data.frame(patient_id=patients, site_id=site_ids_fixed, old_id=sample_cols[ith_cols])
  if (length(df[with(df, patient_id == "11" & site_id == "left_ovary_site_2"),]$site_id) > 0) {
    df[with(df, patient_id == "11" & site_id == "left_ovary_site_2"),]$site_id <- "left_ovary_site_3"
  }
  
  sample_subset <- unique(subset(sample_table, select=c("condensed_id", "patient_id", "site_id")))
  
  df <- plyr::join(df, sample_subset)
  if (nrow(df) > 0) {
    df$is_ancestral <- 0
    df$is_ancestral[df$site_id == "ancestral"] <- 1
  }
  
  #colnames(prop_table)[ith_cols] <- make.unique(df$condensed_id)
  
  prop_final <- melt(prop_table, id.vars = c('signature'), measure.vars = sample_cols,
                     variable.name = "old_id", value.name = "proportion")
  if (external) {
    prop_final_merged <- plyr::join(df, prop_final, by=c("old_id"), type='full')
  } else {
    prop_final_merged <- plyr::join(df, prop_final, by=c("old_id"), type='inner')
  }
  
  prop_final_merged <- prop_final_merged %>% mutate(new_id = ifelse(site_id %in% c("ancestral", "residual"),
                                               yes = paste(patient_id, site_id, sep="_"),
                                               no = condensed_id))
  na_idx <- is.na(prop_final_merged$new_id)
  prop_final_merged$new_id[na_idx] <- as.character(prop_final_merged$old_id[na_idx])
  
  result <- list(prop=prop_final_merged, sigplot=sigs_path)
  return(result)
}

sum_total_variant_counts <- function(variant_sample) {
  variant_sample %>% group_by(condensed_id, patient_id) %>% 
    summarise(snv_count=sum(snv_count), bkpt_count=sum(bkpt_count))
}

compute_mutsig_counts <- function(df) {
  df <- df %>% mutate(is_sv = as.numeric(str_detect(signature, "^SV")),
                sigcount = ifelse(as.character(site_id) == "ancestral",
                  yes = ifelse(is_sv, yes = ancestral_bkpt*proportion, no = ancestral_snv*proportion),
                  no = ifelse(is_sv, yes=bkpt_count * proportion, no = snv_count * proportion)))
  return(df)
}

create_mutsig_matrix <- function(df, col = "proportion") {
  mat <- acast(df, new_id ~ signature, fun.aggregate = mean, value.var = col)
  return(mat)
}
```

```{r}
variant_res <- summarize_variant_counts(master_variant_file, master_breakpoint_file, db_path)
variant_sample <- variant_res$sample
variant_patient <- variant_res$patient

variant_sample_sum <- sum_total_variant_counts(variant_sample)

ihc_table <- fread(ihc_table_path)
ihc_table_subset <- subset(ihc_table, select=c("condensed_id", tiltypes))
ihc_table_slide <- fread(ihc_table_slide_path)
```

## Sample level

### Signatures

![](`r mmctm_sample_sigplot`)


### Results

```{r}
props_sample <- read_mutsig_output(mmctm_sample_result_dir, samples)
props_sample_prop <- subset(props_sample$prop, patient_id != "11")

props_sample_mat <- create_mutsig_matrix(props_sample_prop, col = "proportion")

props_sample_mat_scaled <- clip_values(scale(props_sample_mat), 2, -2)
sample_heat <- pheatmap(props_sample_mat_scaled, fontsize_row = 5, clustering_distance_rows = "correlation", clustering_method = "ward.D2")
```

```{r}
props_sample_merged <- Reduce(function(x,y) plyr::join(x,y),list(props_sample_prop, variant_sample_sum, variant_patient, ihc_table_subset))

props_sample_merged <- compute_mutsig_counts(props_sample_merged)
```

```{r}
COL <- "E_CD8_density"
measure <- "sigcount"

pvals <- setNames(ddply(props_sample_merged, .(signature), function(x) {
  df <- as.data.frame(x)
  df <- df[!is.na(df[,COL]),]
  corres <- summary(lmer(as.formula(paste0(measure, " ~ ", COL, " + (1|patient_id)")), df))
  
  pval <- unname(corres$coefficients[,5][2])
  eq <- substitute(italic(P)==p, list(p=format(pval, digits=3)))
  return(as.character(as.expression(eq)))
}), c("signature", "p.value"))

ggplot(props_sample_merged, aes_string(x=COL, y=measure)) + geom_point(aes(colour=patient_id)) + 
  scale_colour_manual(values = pal_patient) + facet_wrap(~ signature, scales="free") + 
  geom_text(data=pvals, aes(x=Inf, y=Inf, label=p.value), hjust=1.1, vjust=1.5,size=3,parse=TRUE) + 
  theme_bw()
```

```{r}
# x <- Reduce(function(x,y) merge(x,y, by=c("condensed_id")), list(rownames_to_column(as.data.frame(t(prop_matrix)), var = "condensed_id"), 
#            ihc_table_subset))
# x$patient_id <- map_id(x$condensed_id, from = "condensed_id", to="patient_id", db_path)
# x <- subset(x, patient_id != "11")
# 
# cols <- colnames(x)[!colnames(x) %in% c("condensed_id", "patient_id")]
# rmat <- matrix(nrow=length(cols),ncol=length(cols))
# pmat <- matrix(nrow=length(cols),ncol=length(cols))
# 
# for (i in 1:length(cols)) {
#   for (j in i:length(cols)) {
#     if (i != j) {
#       col1 <- cols[i]
#       col2 <- cols[j]
#       formula <- paste0("`", col2, "`", "~", "`", col1, "`", "+ (1|patient_id)", sep=" ")
#       res <- summary(lmer(formula, x))
#       pval <- tryCatch({
#         unname(res$coefficients[,5][2])
#       }, error = function(e) {
#         NA
#       })
#       r <- unname(res$coefficients[,1][2])
#       rmat[i,j] <- rmat[j,i] <- r
#       pmat[i,j] <- pmat[j,i] <- pval
#     } else {
#       rmat[i,j] <- 1
#       pmat[i,j] <- NA
#     }
#   }
# }

```

```{r}
# resmat <- log10(pmat)*-sign(rmat)
# resmat[resmat == Inf] <- NA
# rownames(resmat) <- colnames(resmat) <- cols
# 
# pheatmap(resmat, display_numbers = signif(pmat, 3), cluster_rows = FALSE, cluster_cols = FALSE, fontsize = 5)
```

```{r}
# xmat <- x %>% select(-one_of("condensed_id", "patient_id"))
# 
# cormat <- corr.test(xmat, method="spearman", adjust="fdr")
# pheatmap(cormat$r, display_numbers = signif(cormat$p, 3), fontsize = 5)
```


## Ancestral-descendant level (samples)

### Signatures

![](`r mmctm_sample_ad_sigplot`)

### Results

```{r}
props_ad <- read_mutsig_output(mmctm_sample_ad_result_dir, samples)
props_ad_prop <- subset(props_ad$prop, patient_id != "11")

props_ad_mat <- create_mutsig_matrix(props_ad_prop, col = "proportion")

props_ad_mat_scaled <- clip_values(scale(props_ad_mat), 2, -2)
patient_heat <- pheatmap(props_ad_mat_scaled, fontsize_row = 5, clustering_distance_rows = "correlation")
```

```{r}
props_ad_merged <- Reduce(function(x,y) plyr::join(x,y),list(props_ad_prop, subset(variant_sample, is_ancestral == 0), variant_patient, ihc_table_subset))

props_ad_merged <- compute_mutsig_counts(props_ad_merged)
```

```{r}
COL <- "E_CD8_density"
measure <- "sigcount"

props_ad_merged_descendant <- subset(props_ad_merged, site_id != "ancestral")

pvals <- setNames(ddply(props_ad_merged_descendant, .(signature), function(x) {
  df <- as.data.frame(x)
  df <- df[!is.na(df[,COL]),]
  corres <- summary(lmer(as.formula(paste0(measure, " ~ ", COL, " + (1|patient_id)")), df))
  
  pval <- unname(corres$coefficients[,5][2])
  eq <- substitute(italic(P)==p, list(p=format(pval, digits=3)))
  return(as.character(as.expression(eq)))
}), c("signature", "p.value"))

ggplot(props_ad_merged_descendant, aes_string(x=COL, y=measure)) + geom_point(aes(colour=patient_id)) + 
  scale_colour_manual(values = pal_patient) + facet_wrap(~ signature, scales="free") + 
  geom_text(data=pvals, aes(x=Inf, y=Inf, label=p.value), hjust=1.1, vjust=1.5,size=3,parse=TRUE) + 
  theme_bw()
```

What we can also see from these plots is that not resolving SNV-7 (the noise cluster) is going to be a problem. The signature proportion values from that cluster are fairly similar to those from SNV-6, the APOBEC signature. 


## Ancestral-descendant level (patients)

### Signatures

![](`r mmctm_patient_ad_sigplot`)

### Results

```{r}
props_patient_ad <- read_mutsig_output(mmctm_patient_ad_result_dir, samples)
props_patient_ad_prop <- subset(props_patient_ad$prop, patient_id != "11")

props_patient_ad_mat <- create_mutsig_matrix(props_patient_ad_prop, col = "proportion")

props_patient_ad_mat_scaled <- clip_values(scale(props_patient_ad_mat), 2, -2)
patient_ad_heat <- pheatmap(props_patient_ad_mat_scaled, fontsize_row = 5, clustering_distance_rows = "correlation")
```

```{r}
props_patient_ad_merged <- Reduce(function(x,y) plyr::join(x,y),list(props_patient_ad_prop, subset(variant_sample, is_ancestral == 0), variant_patient))

props_patient_ad_merged <- compute_mutsig_counts(props_patient_ad_merged)
```


## Correlation matrices

```{r}
ihc_mat <- as.data.frame(ihc_table_subset %>% dplyr::select(-condensed_id))
rownames(ihc_mat) <- ihc_table_subset$condensed_id

#TOTAL_TIL_COLS <- c("T_CD8_density", "T_CD4_density", "T_CD20_density", "T_Plasma_density")
#ihc_mat <- subset(ihc_mat, select=TOTAL_TIL_COLS)
```

```{r}
compute_ihc_mutsig_cor <- function(ihc_mat, mutsig_mat, patient_summary = FALSE, ancestral = FALSE) {
  if (patient_summary) {
    ihc_mat <- rownames_to_column(as.data.frame(ihc_mat), "id")
    mutsig_mat <- rownames_to_column(as.data.frame(mutsig_mat), "id")
    
    if (ancestral) {
      mutsig_mat <- subset(mutsig_mat, str_detect(id, "ancestral"))
    } else {
      mutsig_mat <- subset(mutsig_mat, !str_detect(id, "ancestral"))
    }
    
    ihc_mat$patient_id <- str_extract(ihc_mat$id, "^[0-9]+")
    mutsig_mat$patient_id <- str_extract(mutsig_mat$id, "^[0-9]+")
    
    ihc_mat <- ihc_mat %>% dplyr::select(-id) %>% group_by(patient_id) %>% summarise_each(funs(mean(., na.rm=TRUE))) %>% column_to_rownames(var = "patient_id")
    mutsig_mat <- mutsig_mat %>% dplyr::select(-id) %>% group_by(patient_id) %>% summarise_each(funs(mean(., na.rm=TRUE))) %>% column_to_rownames(var = "patient_id")
    
    ihc_mat <- as.matrix(ihc_mat)
    mutsig_mat <- as.matrix(mutsig_mat)
  }
  
  intersect_samples <- intersect(rownames(mutsig_mat), rownames(ihc_mat))
  mutsig <- mutsig_mat[intersect_samples,,drop=FALSE]
  ihc <- ihc_mat[intersect_samples,,drop=FALSE]
  
  pmat <- matrix(nrow=ncol(mutsig), ncol=ncol(ihc))
  rmat <- matrix(nrow=ncol(mutsig), ncol=ncol(ihc))
  rownames(pmat) <- rownames(rmat) <- colnames(mutsig)
  colnames(pmat) <- colnames(rmat) <- colnames(ihc)
  
  for (i in 1:ncol(mutsig)) {
    for (j in 1:ncol(ihc)) {
      corres <- cor.test(mutsig[,i], ihc[,j], method="spearman")
      pmat[i,j] <- corres$p.value
      rmat[i,j] <- corres$estimate
    }
  }
  
  return(list(p=pmat, r=rmat))
}
```

For adjusted p-values just run `p.adjust.mat` on the p-value labels. 

### Sample-level

```{r}
ihc_sample_cor <- compute_ihc_mutsig_cor(ihc_mat, props_sample_mat)

pheatmap(ihc_sample_cor$r, display_numbers = signif(ihc_sample_cor$p, 3))

ihc_sample_cor_summary <- compute_ihc_mutsig_cor(ihc_mat, props_sample_mat, patient_summary = TRUE)

pheatmap(ihc_sample_cor_summary$r, display_numbers = signif(ihc_sample_cor_summary$p, 3))
```

### Ancestral-descendant level

```{r}
ihc_ad_cor <- compute_ihc_mutsig_cor(ihc_mat, props_ad_mat)

pheatmap(ihc_ad_cor$r, display_numbers = signif(ihc_ad_cor$p, 3))

ihc_ad_cor_summary <- compute_ihc_mutsig_cor(ihc_mat, props_ad_mat, patient_summary = TRUE)

pheatmap(ihc_ad_cor_summary$r, display_numbers = signif(ihc_ad_cor_summary$p, 3))

ihc_ad_cor_summary_ancestral <- compute_ihc_mutsig_cor(ihc_mat, props_ad_mat, patient_summary = TRUE, ancestral = TRUE)

pheatmap(ihc_ad_cor_summary_ancestral$r, display_numbers = signif(ihc_ad_cor_summary_ancestral$p, 3))
```

Note: p-values shown are uncorrected. Patient-summarized correlations are insignificant after FDR correction. 

## Cluster-level analysis

Finding correlations at the level of individual signatures can be difficult. Even moreso because some published signatures are combinations of these signatures -- e.g. SV-3 and SV-6 are both foldback signatures in the ancestral-descendant analysis. 

Hence, we can look at the level of clusters from our heatmaps. 

```{r}
make_cluster_frame <- function(clusters) {
  clusts <- rownames_to_column(as.data.frame(clusters), "new_id")
  colnames(clusts)[2] <- "cluster"
  clusts$cluster <- factor(clusts$cluster)
  return(clusts)
}

NCLUST <- 2
selected_cols <- c("new_id", "condensed_id", "patient_id", "old_id", "cluster", tiltypes)
```

### Sample-level

```{r}
clusters <- cutree(sample_heat$tree_row, NCLUST)
sample_clusts <- make_cluster_frame(clusters)

props_sample_merged_clusts <- join(props_sample_merged, sample_clusts)
sample_df <- unique(subset(props_sample_merged_clusts, select=selected_cols))

sample_df_melted <- melt(sample_df, id.vars = colnames(sample_df)[!colnames(sample_df) %in% tiltypes], measure.vars = tiltypes, variable.name = "tiltype", value.name = "density")
```

```{r}
pvals <- setNames(ddply(sample_df_melted, .(tiltype), function(x) {
  df <- as.data.frame(x)
  corres <- wilcox.test(density ~ cluster, df)
  
  pval <- corres$p.value
  eq <- substitute(italic(P)==p, list(p=format(pval, digits=3)))
  return(as.character(as.expression(eq)))
}), c("tiltype", "p.value"))


ggplot(sample_df_melted, aes(x=cluster, y=density)) + geom_boxplot() + theme_bw() + theme_Publication() +  geom_jitter(aes(colour=patient_id), position = position_jitter(width=0.2, height=0)) + scale_color_manual(values = pal_patient) + facet_wrap(~ tiltype, scales = "free") + 
  geom_text(data=pvals, aes(x=Inf, y=Inf, label=p.value), hjust=1.1, vjust=1.5,size=3,parse=TRUE)
```


```{r}
diversity_column <- "observedDiversity_mean"

diversity_files <- list(tcr=tcr_diversity_file, bcr=bcr_diversity_file)

diversity <- lapply(names(diversity_files), function(segment) {
  f <- diversity_files[[segment]]
  xcr_diversity <- read_xcr_diversity_file(f, db_path)
  
  df <- subset(xcr_diversity, select=c("condensed_id", "patient_id", diversity_column))
  colnames(df) <- mapvalues(colnames(df), from=diversity_column, to=segment)
  return(df)
})
names(diversity) <- names(diversity_files)

xcr_diversity <- Reduce(plyr::join, diversity)

XCR_VARS <- c("tcr", "bcr")
```


```{r}
molsubtypes <- fread(molecular_subtype_file)
molsubtypes <- setNames(molsubtypes, c("condensed_id", "subtype"))

proportion_subclonality <- fread(proportion_subclonality_file)
proportion_subclonality_subset <- subset(proportion_subclonality, select=c("condensed_id", "proportion_subclonal"))

exprs <- fread(nanostring_data_path)
labels <- fread(nanostring_annotations_path)

celltype_matrix <- create_celltype_matrix(exprs, labels, db_path)
pathway_matrix <- create_pathway_matrix(exprs, labels, db_path)

celltype_df <- rownames_to_column(as.data.frame(t(celltype_matrix)), var = "condensed_id")
pathway_df <- rownames_to_column(as.data.frame(t(pathway_matrix)), var = "condensed_id")

NANOSTRING_VARS <- c(rownames(celltype_matrix), rownames(pathway_matrix))
```



```{r}
colnames(sample_clusts) <- mapvalues(colnames(sample_clusts), "new_id", "condensed_id")

ihc_stabilize_subset <- stabilize_ihc_variances(ihc_table_slide, ihc_table, tiltypes)

data_matrices <- list(sample_clusts, ihc_stabilize_subset, xcr_diversity, celltype_df, pathway_df, proportion_subclonality_subset)
data_matrices <- lapply(data_matrices, function(x) {
  x %>% dplyr::select(-one_of("patient_id"))
})

combined_df <- Reduce(function(x, y) plyr::join(x,y,type='full'), data_matrices)
combined_df <- subset(combined_df, !is.na(cluster) & !condensed_id %in% c("7_BrnM"))

combined_mat <- subset(combined_df, select=-c(condensed_id, cluster))
rownames(combined_mat) <- combined_df$condensed_id
```

```{r, fig.width=8, fig.height=8}
#ref_dendrogram <- prune(as.dendrogram(sample_heat$tree_row), "7_BrnM")

sample_order <- sample_heat$tree_row$labels[sample_heat$tree_row$order]
sample_order <- intersect(sample_order, rownames(combined_mat))

cluster_annotations <- subset(combined_df, select=c(condensed_id, cluster))

SIGS <- c("SV-3", "SNV-5", "SV-1", "SV-2", "SNV-1")
sig_annotations <- rownames_to_column(data.frame(props_sample_mat[,SIGS]), var = "condensed_id")

## Take logarithms to ~variance stabilize
variant_count_annotations <- variant_sample %>% group_by_(.dots="condensed_id") %>% 
  summarise(snv_count=log(sum(snv_count)), bkpt_count=log(sum(bkpt_count))) %>%
  subset(select=c("condensed_id", "snv_count", "bkpt_count"))

row_annotations <- Reduce(plyr::join, list(cluster_annotations, molsubtypes, sig_annotations,
                                           variant_count_annotations))
row_annotations <- row_annotations %>% column_to_rownames(var = "condensed_id")

combined_mat_scaled <- scale(combined_mat)

# hcs <- lapply(unique(combined_df$cluster), function(clust) {
#   ids <- subset(combined_df, cluster == clust)$condensed_id
#   dists <- dist(combined_mat_scaled[ids,], method = "euclidean")
#   #dists <- proxy::dist(combined_mat_scaled[ids,], method = function(x,y) pairwise_dist(x,y,method="canberra"))
#   hclust(dists, method = "ward.D")
# })
# min_height <- max(sapply(hcs, function(x) max(unique(cophenetic(x)))))*1.1
# 
# hc <- Reduce(function(x,y) merge(as.dendrogram(x),as.dendrogram(y),height = min_height), hcs)
# hc <- as.dendrogram(ref_dendrogram)
# hc <- as.dendrogram(sample_heat2$tree_row)
# 
# ha <- HeatmapAnnotation(row_annotations[rownames(combined_mat_scaled),], which="row")
# 
# Heatmap(clip_values(combined_mat_scaled, 2, -2), cluster_rows = hc2, cluster_columns = TRUE, split = 2, clustering_method_columns = "ward.D2") + ha

pheatmap(clip_values(combined_mat_scaled, 2, -2)[sample_order,], cluster_rows = FALSE, cluster_cols = TRUE, annotation_row = row_annotations, fontsize = 6)
```

What we can see is:

* H-HRD cluster is relatively homogeneous.
* Looks like there are two clusters within the H-FBI group -- one characterized by high levels of immune activity (cytotoxicity, etc.) and one characterized by low levels. 

This is a pretty significant finding. 

Additionally, an interesting thing is that the patients/samples with the highest proportions of foldbacks -- patients 2, 3, and 9 -- are actually the ones with high immune response in the foldback group! Suggestive that perhaps foldback inversions can create neoepitopes. Of course very preliminary and low sample size though. 

If you're wondering why there's no dendrogram on the vertical axis it's because the plotting functions I'm currently using don't allow for self-specified dendrograms. Trying to make one that lets me do so but it's taking a bit of acrobatics and I've wasted a lot of time already ... 

To see ICGC validation, go to that section. I'll add a link later ...

#### TCR/BCR diversity

```{r}
combined_df$patient_id <- map_id(combined_df$condensed_id, from = "condensed_id", to="patient_id", db_path)
combined_df_xcr <- melt(combined_df, id.vars = c("condensed_id", "patient_id", "cluster"), measure.vars = XCR_VARS, variable.name = "type", value.name = "value")

pvals <- setNames(ddply(combined_df_xcr, .(type), function(x) {
  df <- as.data.frame(x)
  corres <- wilcox.test(value ~ cluster, df)
  
  pval <- corres$p.value
  eq <- substitute(italic(P)==p, list(p=format(pval, digits=3)))
  return(as.character(as.expression(eq)))
}), c("type", "p.value"))


ggplot(combined_df_xcr, aes(x=cluster, y=value)) + geom_boxplot() + theme_bw() + theme_Publication() +  geom_jitter(aes(colour=patient_id), position = position_jitter(width=0.2, height=0)) + scale_color_manual(values = pal_patient) + facet_wrap(~ type, scales = "free") + 
  geom_text(data=pvals, aes(x=Inf, y=Inf, label=p.value), hjust=1.1, vjust=1.5,size=3,parse=TRUE)
```

#### Celltypes and pathways

```{r, fig.width=10, fig.height=10}
combined_df$patient_id <- map_id(combined_df$condensed_id, from = "condensed_id", to="patient_id", db_path)
combined_df_nanostring <- melt(combined_df, id.vars = c("condensed_id", "patient_id", "cluster"), measure.vars = NANOSTRING_VARS, variable.name = "type", value.name = "value")

pvals <- setNames(ddply(combined_df_nanostring, .(type), function(x) {
  df <- as.data.frame(x)
  corres <- wilcox.test(value ~ cluster, df)
  
  pval <- corres$p.value
  eq <- substitute(italic(P)==p, list(p=format(pval, digits=3)))
  return(as.character(as.expression(eq)))
}), c("type", "p.value"))


ggplot(combined_df_nanostring, aes(x=cluster, y=value)) + geom_boxplot() + theme_bw() + theme_Publication() +  geom_jitter(aes(colour=patient_id), position = position_jitter(width=0.2, height=0)) + scale_color_manual(values = pal_patient) + facet_wrap(~ type, scales = "free") + 
  geom_text(data=pvals, aes(x=Inf, y=Inf, label=p.value), hjust=1.1, vjust=1.5,size=3,parse=TRUE)
```

### Ancestral-descendant level

```{r}
clusters <- cutree(patient_heat$tree_row, NCLUST)
patient_clusts <- make_cluster_frame(clusters)

props_ad_merged_clusts <- join(props_ad_merged, patient_clusts)
patient_df <- unique(subset(props_ad_merged_clusts, select=selected_cols))

patient_df_melted <- melt(patient_df, id.vars = colnames(patient_df)[!colnames(patient_df) %in% tiltypes], measure.vars = tiltypes, variable.name = "tiltype", value.name = "density")
```

```{r}
pvals <- setNames(ddply(patient_df_melted, .(tiltype), function(x) {
  df <- as.data.frame(x)
  corres <- wilcox.test(density ~ cluster, df)
  
  pval <- corres$p.value
  eq <- substitute(italic(P)==p, list(p=format(pval, digits=3)))
  return(as.character(as.expression(eq)))
}), c("tiltype", "p.value"))


ggplot(patient_df_melted, aes(x=cluster, y=density)) + geom_boxplot() + theme_bw() + theme_Publication() +  geom_jitter(aes(colour=patient_id), position = position_jitter(width=0.2, height=0)) + scale_color_manual(values = pal_patient) + facet_wrap(~ tiltype, scales = "free") + 
  geom_text(data=pvals, aes(x=Inf, y=Inf, label=p.value), hjust=1.1, vjust=1.5,size=3,parse=TRUE)
```


Hence, the foldback group (cluster 2) has lower CD8+ and CD4+ densities than the HRD group, as expected. 

A caveat is that p-values are computed with Wilcoxon, irrespective of patient number. We can't really opt for a nonparametric nested ranks test because very few patients (4) are actually present in both clusters. 

## Ancestral analysis

Ancestral variants may have different properties from descendant variants. 

### Sample-specific

Here we'll just naively allow for subclonal variants to be counted multiple times. 

```{r}
props_ad_merged$is_ancestral <- as.factor(props_ad_merged$is_ancestral)

pvals <- setNames(ddply(props_ad_merged, .(signature), function(x) {
  df <- as.data.frame(x)
  corres <- wilcox.test(proportion ~ is_ancestral, df)
  
  pval <- corres$p.value
  eq <- substitute(italic(P)==p, list(p=format(pval, digits=3)))
  return(as.character(as.expression(eq)))
}), c("signature", "p.value"))

ggplot(props_ad_merged, aes(x=is_ancestral, y=proportion)) + geom_boxplot() + geom_jitter(aes(colour=patient_id), position=position_jitter(width=0.2, height=0)) + theme_bw() + theme_Publication() + scale_colour_manual(values = pal_patient) + facet_wrap(~ signature, scales= "free") + 
  geom_text(data=pvals, aes(x=Inf, y=Inf, label=p.value), hjust=1.1, vjust=1.5,size=3,parse=TRUE)
```

P-values are uncorrected. 

Unsurprisingly, ancestral samples have significantly more of SNV-4, the age signature. Insignificant, but they also have less SV-1, which is one of the BRCA's (small deletions). 

### Union

Here we'll actually take the union of subclonal variants for each patient so we don't overcount. 

```{r}
props_patient_ad_merged$is_ancestral <- as.factor(props_patient_ad_merged$is_ancestral)

pvals <- setNames(ddply(props_patient_ad_merged, .(signature), function(x) {
  df <- as.data.frame(x)
  corres <- wilcox.test(proportion ~ is_ancestral, df, paired=TRUE)
  
  pval <- corres$p.value
  eq <- substitute(italic(P)==p, list(p=format(pval, digits=3)))
  return(as.character(as.expression(eq)))
}), c("signature", "p.value"))

ggplot(props_patient_ad_merged, aes(x=is_ancestral, y=proportion)) + geom_boxplot() + geom_jitter(aes(colour=patient_id), position=position_jitter(width=0.2, height=0)) + theme_bw() + theme_Publication() + scale_colour_manual(values = pal_patient) + facet_wrap(~ signature, scales= "free") + 
  geom_text(data=pvals, aes(x=Inf, y=Inf, label=p.value), hjust=1.1, vjust=1.5,size=3,parse=TRUE)
```

Aside from age and HRD which were described in McPherson et al., there's additionally the SV-3 signature -- a translocation signature. Implying that translocations are early (ancestral) events in HGSC -- perhaps this is interesting? I have to do a literature search. 


```{r}
anc_desc_patient <- dcast(props_patient_ad_merged, formula = patient_id + signature ~ is_ancestral, value.var = "proportion")

pvals <- setNames(ddply(anc_desc_patient, .(signature), function(x) {
  df <- as.data.frame(x)
  corres <- with(df, cor.test(`0`, `1`, method="spearman"))
  
  pval <- corres$p.value
  eq <- substitute(italic(P)==p, list(p=format(pval, digits=3)))
  return(as.character(as.expression(eq)))
}), c("signature", "p.value"))

ggplot(anc_desc_patient, aes(x=`1`, y=`0`)) + geom_point() + geom_point(aes(colour=patient_id)) + theme_bw() + theme_Publication() + scale_colour_manual(values = pal_patient) + facet_wrap(~ signature, scales= "free") + geom_text(data=pvals, aes(x=Inf, y=Inf, label=p.value), hjust=1.1, vjust=1.5,size=3,parse=TRUE)
```

## ICGC validation

### Signatures

![](`r mmctm_ov_combined_sigplot`)

### FBI subclusters

```{r}
specimen_data <- fread(icgc_specimen_file)
icgc_subtypes <- fread(icgc_subtype_file)
icgc_expr_mat <- fread(icgc_expr_mat_file)
icgc_expr_df <- icgc_expr_mat %>% as.data.frame %>% column_to_rownames("icgc_donor_id") %>% t %>% as.data.frame %>% rownames_to_column("Name")
icgc_celltype_matrix <- create_pathway_matrix(icgc_expr_df, labels, db_path, convert_ids = FALSE)

icgc_immune <- rownames_to_column(as.data.frame(t(icgc_celltype_matrix)), var = "icgc_donor_id")

# cytotoxic_markers <- c("PRF1", "GZMA", "HLA-A", "HLA-B", "HLA-C", "GZMK", "GZMM", "GZMH", "GNLY", "GZMB")
# icgc_immune <- as.data.frame(subset(icgc_expr_mat, select=c("icgc_donor_id", cytotoxic_markers)))
# for (i in 2:ncol(icgc_immune)) {
#  icgc_immune[,i] <- log10(icgc_immune[,i])
# }
```

```{r, fig.height=12}
props_ov_combined <- read_mutsig_output(mmctm_ov_combined_result_dir, samples, external = TRUE)
props_ov_combined_prop <- props_ov_combined$prop

props_ov_combined_mat <- create_mutsig_matrix(props_ov_combined_prop, col = "proportion")

# props_ov_combined_mat_scaled <- as.data.frame(apply(props_ov_combined_mat, 2, function(x) {
#   #x <- logit(x)
#   (x-median(x,na.rm=TRUE))/mad(x,na.rm=TRUE)
# }))

props_ov_combined_mat_scaled <- scale(props_ov_combined_mat)

#props_ov_combined_mat_scaled <- clip_values(props_ov_combined_mat_scaled, 3, -3)
#clip_values(scale(props_ov_combined_mat), 2, -2)

ov_combined_heat <- pheatmap(props_ov_combined_mat_scaled, fontsize_row = 6, clustering_distance_rows = "correlation", clustering_method = "ward.D2")
```

```{r}
NCLUST <- 3

clusters <- cutree(ov_combined_heat$tree_row, NCLUST)
ov_combined_clusts <- make_cluster_frame(clusters)

colnames(icgc_immune) <- mapvalues(colnames(icgc_immune), "icgc_donor_id", "new_id")

data_matrices <- list(ov_combined_clusts, icgc_immune)
data_matrices <- lapply(data_matrices, function(x) {
  x %>% dplyr::select(-one_of("patient_id"))
})

combined_df <- Reduce(function(x, y) plyr::join(x,y,type='inner'), data_matrices)
#combined_df <- subset(combined_df, cluster == 1)

combined_mat <- subset(combined_df, select=-c(new_id, cluster))
rownames(combined_mat) <- combined_df$new_id
```

```{r, fig.width=8, fig.height=8}
sample_order <- ov_combined_heat$tree_row$labels[ov_combined_heat$tree_row$order]
sample_order <- intersect(sample_order, rownames(combined_mat))

cluster_annotations <- subset(combined_df, select=c(new_id, cluster))

SIGS <- c("SV-8", "SNV-1", "SV-4", "SV-7", "SV-5", "SNV-7", "SV-1")
sig_annotations <- rownames_to_column(data.frame(props_ov_combined_mat[,SIGS], check.names = FALSE), var = "new_id")

subtype_annotations <- subset(icgc_subtypes, select=c("icgc_donor_id", "subtype", "nmf_subtype"))
colnames(subtype_annotations)[1] <- "new_id"

row_annotations <- Reduce(plyr::join, list(cluster_annotations, sig_annotations, subtype_annotations))
row_annotations <- row_annotations %>% column_to_rownames(var = "new_id")

select_rows <- rownames(subset(row_annotations, `SV-4` < 1))
notx <- subset(specimen_data, str_detect(specimen_type, "Primary") & specimen_donor_treatment_type == "no treatment")$icgc_donor_id
sample_order <- intersect(sample_order, select_rows)
sample_order <- intersect(sample_order, notx)

#order_fbi <- order(row_annotations[rownames(combined_mat_scaled),]$`SV-8`)

#combined_mat_scaled <- scale(combined_mat)
combined_mat_scaled <- as.data.frame(apply(combined_mat, 2, function(x) (x-median(x,na.rm=TRUE))/mad(x,na.rm=TRUE)))

combined_mat_scaled <- combined_mat_scaled[sample_order,]
other_mat <- props_ov_combined_mat_scaled[sample_order,]

pheatmap(clip_values(cbind(combined_mat_scaled, other_mat), 2, -2), cluster_rows = FALSE, cluster_cols = TRUE, annotation_row = row_annotations, fontsize = 6, clustering_distance_cols = "correlation", clustering_method = "ward.D2")

#rowmean <- rowMeans(combined_mat_scaled)
#a <- merge(row_annotations, as.data.frame(rowmean), by="row.names")
#ggplot(subset(a, cluster != 1), aes(x=rowmean > median(rowmean), y = `SNV-7`)) + geom_boxplot() + theme_bw()
```

```{r}
clusters <- setNames(cluster_annotations, c("icgc_donor_id", "cluster"))
```

### Differential expression

```{r}
icgc_expr_melted <- fread(icgc_expr_raw_file)
icgc_expr_casted_matrix <- expression_df_to_matrix(icgc_expr_melted, summarize_over = "icgc_donor_id", measure_var = "raw_read_count")

icgc_counts <- icgc_expr_casted_matrix %>% column_to_rownames(var = "icgc_donor_id") 
icgc_counts_filtered <- icgc_counts[clusters$icgc_donor_id,]
icgc_counts_filtered <- icgc_counts_filtered %>% t %>% as.data.frame

dge <- DGEList(counts = icgc_counts_filtered)
dge <- calcNormFactors(dge)

library_sizes <- colSums(icgc_counts_filtered)
library_sizes
max(library_sizes)/min(library_sizes)

design <- model.matrix(~0 + factor(clusters$cluster))
colnames(design) <- paste0("clust", 1:NCLUST)
contrast.matrix <- makeContrasts(clust1-clust3, clust2-(clust1+clust3)/2,clust2-clust1, levels=design)
```

#### limma-trend

We straddle close to threshold for doing this but let's do it anyways. 

```{r}
fit_trend <- rnaseq_DE_initial_fit(dge, design, type = "limma_trend")
```

```{r}
fit_trend_contrasts <- contrasts.fit(fit_trend, contrast.matrix)
fit_trend_contrasts <- eBayes(fit_trend_contrasts)

hrd_fbi_genes <- topTable(fit_trend_contrasts, coef = "clust2 - (clust1 + clust3)/2", adjust.method = "BH", number = Inf, sort="p")
fbi_immune_genes <- topTable(fit_trend_contrasts, coef = "clust1 - clust3", number = Inf, sort="p")
hrd_fbi_highimmune_genes <- topTable(fit_trend_contrasts, coef = "clust2 - clust1", number = Inf, sort="p")
```

```{r}
hrd_fbi_pathways <- pathway_analysis(hrd_fbi_genes, gs_type = "go")
fbi_immune_pathways <- pathway_analysis(fbi_immune_genes, gs_type = "go")
hrd_fbi_highimmune_pathways <- pathway_analysis(hrd_fbi_highimmune_genes, gs_type = "go")
```

##### HRD vs. FBI samples (clust2 vs. clust1+clust3)

```{r}
datatable(hrd_fbi_pathways, extensions = "Buttons", options=list(pageLength=5, scrollX=TRUE,dom='Bfrtip',buttons = c('copy', 'csv', 'excel', 'pdf', 'print')))
```



##### High immune FBI vs. low immune FBI (clust1 vs. clust3)

```{r}
datatable(fbi_immune_pathways, extensions = "Buttons", options=list(pageLength=5, scrollX=TRUE,dom='Bfrtip',buttons = c('copy', 'csv', 'excel', 'pdf', 'print')))
```


##### HRD vs. high immune FBI (clust2 vs. clust1)

```{r}
datatable(hrd_fbi_highimmune_pathways, extensions = "Buttons", options=list(pageLength=5, scrollX=TRUE,dom='Bfrtip',buttons = c('copy', 'csv', 'excel', 'pdf', 'print')))
```

#### voom

We have greater than 3-fold variability in library size, so we're better off using the voom method for DE analysis. 

```{r}
fit_voom <- rnaseq_DE_initial_fit(dge, design, type = "voom")
```

```{r}
fit_voom_contrasts <- contrasts.fit(fit_voom, contrast.matrix)
fit_voom_contrasts <- eBayes(fit_voom_contrasts)

hrd_fbi_genes <- topTable(fit_voom_contrasts, coef = "clust2 - (clust1 + clust3)/2", adjust.method = "BH", number =Inf, sort="p")
fbi_immune_genes <- topTable(fit_voom_contrasts, coef = "clust1 - clust3", number = Inf, sort="p")
hrd_fbi_highimmune_genes <- topTable(fit_voom_contrasts, coef = "clust2 - clust1", number = Inf, sort="p")
```


```{r, message=TRUE, warning=TRUE}
hrd_fbi_pathways <- pathway_analysis(hrd_fbi_genes, gs_type = "kegg")
fbi_immune_pathways <- pathway_analysis(fbi_immune_genes, gs_type = "kegg")
hrd_fbi_highimmune_pathways <- pathway_analysis(hrd_fbi_highimmune_genes, gs_type = "kegg")
# pv.out.list <- sapply(path.ids2, function(pid) pathview(
#                       gene.data =  exp.fc, pathway.id = pid,
#                       species = "hsa", out.suffix=out.suffix))
```

**Note:** BRCA1, BRCA2, and RPA are also downregulated in the HRD group -- the entire HR-associated gene cluster isn't significantly downregulated though. 

One of the key reasons why the entire HR pathway isn't significantly downregulated is because PolQ is part of it and PolQ is upregulated. Of note, PolQ promotes MMEJ, providing further evidence for MMEJ activity in FBI tumours. 

##### HRD vs. FBI samples (clust2 vs. clust1+clust3)

```{r}
datatable(hrd_fbi_pathways, extensions = "Buttons", options=list(pageLength=5, scrollX=TRUE,dom='Bfrtip',buttons = c('copy', 'csv', 'excel', 'pdf', 'print')))
```



##### High immune FBI vs. low immune FBI (clust1 vs. clust3)

```{r}
datatable(fbi_immune_pathways, extensions = "Buttons", options=list(pageLength=5, scrollX=TRUE,dom='Bfrtip',buttons = c('copy', 'csv', 'excel', 'pdf', 'print')))
```


##### HRD vs. high immune FBI (clust2 vs. clust1)

```{r}
datatable(hrd_fbi_highimmune_pathways, extensions = "Buttons", options=list(pageLength=5, scrollX=TRUE,dom='Bfrtip',buttons = c('copy', 'csv', 'excel', 'pdf', 'print')))
```


#### Some DNA repair genes

```{r}
v <- voom(icgc_counts_filtered, design, normalize="quantile", plot=FALSE)
```

```{r}
icgc_expr_df_wide <- v$E %>% t %>% as.data.frame %>% rownames_to_column("icgc_donor_id")

icgc_expr_df_melted <- reshape2::melt(icgc_expr_df_wide, id.vars = c("icgc_donor_id"), measure.vars = colnames(icgc_expr_df_wide)[2:ncol(icgc_expr_df_wide)], variable.name = "Name", value.name = "expression")

icgc_expr_df_melted_labeled <- plyr::join(icgc_expr_df_melted, clusters)

REPAIR_GENES <- c("BRCA1", "BRCA2", "POLQ", "FEN1", "XRCC1", "PARP1", "LIG3")

icgc_expr_df_melted_labeled_filtered <- subset(icgc_expr_df_melted_labeled, Name %in% REPAIR_GENES)

ggplot(subset(icgc_expr_df_melted_labeled_filtered, !is.na(cluster)), aes(x=cluster, y=expression)) + geom_boxplot() + geom_jitter(position=position_jitter(width=0.2, height=0)) + theme_bw() + theme_Publication() + facet_wrap(~ Name, scales = "free")
```

### Survival

```{r}
icgc_clinical <- read_donor_specimen_survival(icgc_donor_file, icgc_specimen_file)

icgc_clinical_labeled <- plyr::join(clusters, icgc_clinical)

icgc_clinical_labeled$SurvObj <- with(icgc_clinical_labeled, Surv(donor_survival_time, donor_vital_status == "deceased"))
```

Survival by FBI status ...

```{r}
simple_survival_analysis(SurvObj ~ cluster != 2, data = icgc_clinical_labeled)
```

```{r}
simple_survival_analysis(SurvObj ~ cluster, data = icgc_clinical_labeled)
```

## TCGA validation

While we can't directly do mutation signature analysis with MMCTM on exome data (I suppose this is theoretically possible but we'd probably be restricted in the types of events we can call, like long SVs), we can look at the FBI-HLAMP finding from Yikan's paper and see if that lines up with immune signatures. 

```{r}
tcga_expr_mat <- read_tcga_exprs(tcga_expr_mat_file)
tcga_ov_annotation <- fread(tcga_ov_annotation_file)
#tcga_fbi_hlamp_proportions <- fread(tcga_fbi_hlamp_proportions_file)
```

```{r}
tcga_expr_df <- tcga_expr_mat %>% as.data.frame %>% column_to_rownames("tcga_sample_id") %>% t %>% as.data.frame %>% rownames_to_column("Name")
tcga_celltype_matrix <- create_pathway_matrix(tcga_expr_df, labels, db_path, convert_ids = FALSE)

tcga_immune <- rownames_to_column(as.data.frame(t(tcga_celltype_matrix)), var = "tcga_sample_id")
```

```{r, fig.width=8, fig.height=16}
mat <- tcga_immune %>% column_to_rownames("tcga_sample_id") %>% scale #%>% clip_values(2, -2)
row_annotations <- tcga_ov_annotation %>% as.data.frame %>% column_to_rownames("tcga_sample_id")

tcgaheat <- pheatmap(mat, annotation_row = row_annotations, cluster_rows = TRUE, cluster_cols = TRUE, clustering_method = "ward.D", show_rownames = FALSE)

#tcga_clusters <- rownames_to_column(data.frame(cluster=factor(cutree(tcgaheat$tree_row, 2))), var = "tcga_sample_id")

col <- "Cytotoxicity"
tcga_clusters <- rownames_to_column(data.frame(cluster=mat[,col] > median(mat[,col])), "tcga_sample_id") %>% mutate(cluster = as.factor(as.numeric(cluster)))

#row_annotations <- plyr::join(tcga_ov_annotation, tcga_clusters) %>% as.data.frame %>% column_to_rownames("tcga_sample_id")
#pheatmap(mat, annotation_row = row_annotations, cluster_rows = TRUE, cluster_cols = TRUE, clustering_method = "ward.D", show_rownames = FALSE)
```


```{r}
df_merged <- plyr::join(tcga_immune, tcga_ov_annotation)

expr_measure_vars <- colnames(tcga_immune)[2:ncol(tcga_immune)]

df_merged_melted <- melt(df_merged, id.vars = c("tcga_sample_id", "Subgroup", "MolecularSubtype", "BRCA.status"), measure.vars = expr_measure_vars, variable.name = "Measure", value.name = "Expression")
```

```{r, fig.height = 18}
pvals <- setNames(ddply(subset(df_merged_melted, !is.na(Subgroup)), .(Measure), function(x) {
  df <- as.data.frame(x)
  testres <- kruskal.test(Expression ~ factor(Subgroup), data = df)
  
  pval <- testres$p.value
  eq <- substitute(italic(P)==p, list(p=format(pval, digits=3)))
  return(as.character(as.expression(eq)))
}), c("Measure", "p.value"))

ggplot(subset(df_merged_melted, !is.na(Subgroup)), aes(x=Subgroup, y=Expression)) + geom_boxplot() + theme_bw() + facet_wrap(~ Measure, scales = "free", ncol=3) + theme_Publication() + geom_text(data=pvals, aes(x=Inf, y=Inf, label=p.value), hjust=1.1, vjust=1.5,size=3,parse=TRUE) 
```

P-values are uncorrected. So after correction, we aren't going to get anything significant. 

In conclusion, the immune subgroups constitute a new subgrouping independent of FBI-HLAMP. 

I've also done correlation testing using the logR values that Yikan's provided me (rather than the discrete groupings of no AMP, FBI-AMP low, and FBI-AMP high), but the correlations are very poor (rho values of ~ -0.05 or so, p-values of >=0.3). 

### Survival


```{r}
tcga_clinical <- read_tcga_clinical(tcga_donor_file, type = "synapse2", filter = TRUE, unique = FALSE)

tcga_clinical_labeled <- Reduce(function(x,y) plyr::join(x,y, type = 'full'), list(tcga_clusters, tcga_clinical, tcga_ov_annotation))

tcga_clinical_labeled$SurvObj <- with(tcga_clinical_labeled, Surv(ifelse(vital_status == "Dead", death_days_to, last_contact_days_to), vital_status == "Dead"))

#tcga_clinical_labeled <- subset(tcga_clinical_labeled, additional_drug_therapy == "NO" & additional_immuno_therapy == "NO" & additional_pharmaceutical_therapy == "NO" &  targeted_molecular_therapy == "NO" & radiation_therapy == "NO")
#tcga_clinical_labeled <- subset(tcga_clinical_labeled, str_detect(tumor_stage, "^III"))
```

#### FBI-AMP colocalization

Let's first stratify by Yikan's groupings:

```{r}
simple_survival_analysis(SurvObj ~ Subgroup, data = tcga_clinical_labeled)
```

Note: The logrank p-values in the paper are incorrect, the one's I've computed are the right ones. In the paper, the death days from the dead patients were not correctly combined into the computation, so p-values were computed without the dead patients. 

#### Immune activity

We'll now stratify by `cluster`, a variable that indicates whether or not we're above median in terms of `r col`. In other words, 1 means we're high immune, and 0 means we're low immune. 

```{r}
simple_survival_analysis(SurvObj ~ cluster, data = tcga_clinical_labeled)
```

Low immune tumours trend towards poor survival but this is not significant at all. 

Now let's try by treating immune activity as a continuous variable, controlling for other covariates:

```{r}
tcga_immune_continuous <-  tcga_celltype_matrix %>% t %>% as.data.frame %>% rownames_to_column("tcga_sample_id")

tcga_clinical_labeled_continuous <- plyr::join(tcga_clinical_labeled, tcga_immune_continuous)

coxph(SurvObj ~ Cytotoxicity + age_at_initial_pathologic_diagnosis + tumor_grade + str_extract(tumor_stage, "[IV]+") + additional_chemo_therapy + additional_drug_therapy + additional_immuno_therapy, tcga_clinical_labeled_continuous)
```

Indeed, immune activity is significantly associated with prolonged survival (negative coefficient = fewer death events). 

#### Stratification by foldback-AMP status

```{r}
fbi_high <- subset(tcga_clinical_labeled, Subgroup == "FBI-AMP High")
fbi_low <- subset(tcga_clinical_labeled, Subgroup == "FBI-AMP Low")
fbi_no <- subset(tcga_clinical_labeled, Subgroup == "No AMP")
fbi_nothigh <- subset(tcga_clinical_labeled, Subgroup %in% c("FBI-AMP Low", "No AMP"))
```

```{r}
simple_survival_analysis(SurvObj ~ cluster, data = fbi_high)
```

```{r}
simple_survival_analysis(SurvObj ~ cluster, data = fbi_low)
```

```{r}
simple_survival_analysis(SurvObj ~ cluster, data = fbi_no)
```

What's curious about this is that it seems only the no AMP group gets any benefit from having a high immune response. So there's no benefit within the foldback group of high/low immune response. 
```{r}
simple_survival_analysis(SurvObj ~ cluster, data = fbi_nothigh)
```

#### Stratification by immune activity

```{r}
immune_low <- subset(tcga_clinical_labeled, cluster == 0)
immune_high <- subset(tcga_clinical_labeled, cluster == 1)
```

```{r}
simple_survival_analysis(SurvObj ~ Subgroup, data = immune_high)

simple_survival_analysis(SurvObj ~ Subgroup, data = immune_low)
```

Likewise, the only benefit of being non-foldback is derived from the immune-high group -- there's no difference between being foldback or not if you're in the low immune group. 

#### Multivariate models

```{r}
simple_survival_analysis(SurvObj ~ Subgroup + cluster, data = tcga_clinical_labeled)
```

We'll next combined FBI-AMP low and No AMP into a single category called FBI-NotHigh. 

```{r}
tcga_clinical_labeled$newgroup <- ifelse(tcga_clinical_labeled$Subgroup == "FBI-AMP High", "FBI-High", "FBI-NotHigh")

simple_survival_analysis(SurvObj ~ newgroup + cluster, data = tcga_clinical_labeled)
```


Let's also do Cox proportional hazards models:

```{r}
mod1 <- coxph(SurvObj ~ Cytotoxicity + Subgroup + age_at_initial_pathologic_diagnosis + tumor_grade + str_extract(tumor_stage, "[IV]+") + additional_chemo_therapy + additional_drug_therapy + additional_immuno_therapy, tcga_clinical_labeled_continuous)
anova(mod1)
```

```{r}
mod2 <- coxph(SurvObj ~ Cytotoxicity + Subgroup + age_at_initial_pathologic_diagnosis, tcga_clinical_labeled_continuous)
anova(mod2)
```

```{r}
mod3 <- coxph(SurvObj ~ Cytotoxicity + (Subgroup == "FBI-AMP High") + age_at_initial_pathologic_diagnosis, subset(tcga_clinical_labeled_continuous, !is.na(Subgroup)))
anova(mod3)
```

So the effects of immune activity and FBI/HRD status are collinear. 

## Final run

Instead of overwriting the previous, this is its own section since the underlying implementation has changed substantially. 

```{r}
variant_table <- read_variant_file(master_variant_file, db_path)
breakpoint_table <- read_variant_file(master_breakpoint_file, db_path)

sig_results <- read_signature_files(mmctm_final_patient_dir, variant_table, breakpoint_table, db_path)

sig_results_snv <- subset(sig_results$snv, is_present == 1)
sig_results_sv <- subset(sig_results$sv, is_present == 1)
sig_results_nonith <- sig_results$nonith %>% as.data.frame %>% column_to_rownames("signature") %>% t %>% as.data.frame %>% rownames_to_column("condensed_id")

signature_labels <- str_extract(c(colnames(sig_results_snv), colnames(sig_results_sv)), "SN?V-[0-9]+")
signature_labels <- signature_labels[!is.na(signature_labels)]
```

```{r}
summarize_signature_proportions <- function(x, by=c("condensed_id", "patient_id"), signature_labels, report_count = FALSE) {
  props <- subset(x, select=c(by, colnames(x)[colnames(x) %in% signature_labels])) %>% group_by_(.dots=by) %>% summarise_each(funs(sum))
  
  if (report_count) {
    counts <- subset(x, select=c(by, colnames(x)[colnames(x) %in% signature_labels])) %>% group_by_(.dots=by) %>% summarise(n=n())
    props <- plyr::join(props %>% as.data.frame, counts %>% as.data.frame)
  }
  sums <- rowSums(subset(props, select=colnames(props[colnames(props) %in% signature_labels])))
  sigs <- intersect(signature_labels, colnames(props))
  for (sig in sigs) {
    props[,sig] <- props[,sig]/sums
  }
  return(props)
}
```

### Signatures

![](`r mmctm_final_patient_sigplot`)

### Sample signature proportions

These proportions won't be exactly equal to the topic proportions that the model outputs (they are variational estimates), but we're actually doing pretty darn well. 

TODO: Make a QC plot. From a glance it seems that proportion error is usually within 1-5% relative error. 

```{r}
sample_props_snv <- summarize_signature_proportions(sig_results_snv, by=c("condensed_id", "patient_id"), signature_labels)
sample_props_sv <- summarize_signature_proportions(sig_results_sv, by=c("condensed_id", "patient_id"), signature_labels)

patient_props_snv <- sig_results_snv %>% subset(select=c("patient_id", "chrom", "coord", "ref", "alt", intersect(signature_labels, colnames(sig_results_snv)))) %>% unique %>% summarize_signature_proportions(by=c("patient_id"), signature_labels)
patient_props_sv <- sig_results_sv %>% subset(select=c("patient_id", "prediction_id", intersect(signature_labels, colnames(sig_results_sv)))) %>% unique %>% summarize_signature_proportions(by=c("patient_id"), signature_labels)
```

```{r}
sample_props <- plyr::join(sample_props_snv %>% as.data.frame, sample_props_sv %>% as.data.frame)
sample_props <- rbind.fill(sample_props, sig_results_nonith)

sample_props_mat <- sample_props %>% column_to_rownames("condensed_id") %>% subset(select=c(colnames(sample_props)[colnames(sample_props) %in% signature_labels]))

sample_props_mat_scaled <- scale(sample_props_mat)
#[,c("SNV-2", "SNV-5", "SV-3", "SV-6", "SV-8", "SV-1", "SV-7", "SV-5", "SNV-3")]
sample_props_heat <- pheatmap(sample_props_mat_scaled, fontsize_row = 5,
                              clustering_method = "ward.D2")
```

```{r}
patient_props <- plyr::join(patient_props_snv %>% as.data.frame, patient_props_sv %>% as.data.frame)
patient_props <- rbind.fill(patient_props, sig_results_nonith %>% plyr::rename(c("condensed_id"="patient_id")))

patient_props_mat <- patient_props %>% column_to_rownames("patient_id") %>% subset(select=c(colnames(patient_props)[colnames(patient_props) %in% signature_labels]))

patient_props_mat_scaled <- scale(patient_props_mat)
#[,c("SNV-2", "SNV-5", "SV-3", "SV-6", "SV-8", "SV-1", "SV-7", "SV-5", "SNV-3")]
patient_props_heat <- pheatmap(patient_props_mat_scaled, fontsize_row = 5,
                              clustering_method = "ward.D2", clustering_distance_rows = "correlation")
```

#### ITH cohort

```{r}
sample_clusts <- as.data.frame(cutree(sample_props_heat$tree_row, 4)) %>% setNames(c("cluster")) %>% rownames_to_column("condensed_id")

ihc_stabilize_subset <- stabilize_ihc_variances(ihc_table_slide, ihc_table, tiltypes)

data_matrices <- list(sample_clusts, ihc_stabilize_subset, xcr_diversity, celltype_df, pathway_df, proportion_subclonality_subset)
data_matrices <- lapply(data_matrices, function(x) {
  x %>% dplyr::select(-one_of("patient_id"))
})

combined_df <- Reduce(function(x, y) plyr::join(x,y,type='full'), data_matrices)
combined_df <- subset(combined_df, !is.na(cluster) & !condensed_id %in% c("7_BrnM"))
combined_df <- subset(combined_df, condensed_id %in% subset(samples, project_code == "ITH")$condensed_id)

combined_mat <- subset(combined_df, select=-c(condensed_id, cluster))
rownames(combined_mat) <- combined_df$condensed_id
```

```{r, fig.width=8, fig.height=8}
#ref_dendrogram <- prune(as.dendrogram(sample_heat$tree_row), "7_BrnM")

sample_order <- sample_props_heat$tree_row$labels[sample_props_heat$tree_row$order]
sample_order <- intersect(sample_order, rownames(combined_mat))

cluster_annotations <- subset(combined_df, select=c(condensed_id, cluster))

SIGS <- c("SV-3", "SNV-5", "SV-6", "SV-8", "SNV-2")
sig_annotations <- rownames_to_column(data.frame(sample_props_mat[,SIGS], check.names = FALSE), var = "condensed_id")

## Take logarithms to ~variance stabilize
variant_count_annotations <- variant_sample %>% group_by_(.dots="condensed_id") %>% 
  summarise(snv_count=log(sum(snv_count)), bkpt_count=log(sum(bkpt_count))) %>%
  subset(select=c("condensed_id", "snv_count", "bkpt_count"))

row_annotations <- Reduce(plyr::join, list(cluster_annotations, molsubtypes, sig_annotations,
                                           variant_count_annotations))
row_annotations <- row_annotations %>% column_to_rownames(var = "condensed_id")
row_annotations_ith <- row_annotations

combined_mat_scaled <- scale(combined_mat)

pheatmap(clip_values(combined_mat_scaled, 2, -2)[sample_order,], cluster_rows = FALSE, cluster_cols = TRUE, annotation_row = row_annotations, fontsize = 6)

combined_mat_scaled_ith <- combined_mat_scaled
```

#### ICGC validation

```{r}
icgc_fbi_status <- fread(wang_icgc_fbi_status_file)
colnames(icgc_fbi_status) <- mapvalues(colnames(icgc_fbi_status), from = c("Case_ID"), to = c("condensed_id"))
```

```{r}
colnames(icgc_immune) <- mapvalues(colnames(icgc_immune), "new_id", "condensed_id")

data_matrices <- list(sample_clusts, icgc_immune)
data_matrices <- lapply(data_matrices, function(x) {
  x %>% dplyr::select(-one_of("patient_id"))
})

combined_df <- Reduce(function(x, y) plyr::join(x,y,type='inner'), data_matrices)
#combined_df <- subset(combined_df, cluster == 1)

combined_mat <- subset(combined_df, select=-c(condensed_id, cluster))
rownames(combined_mat) <- combined_df$condensed_id
```

```{r, fig.width=8, fig.height=8}
sample_order <- sample_props_heat$tree_row$labels[sample_props_heat$tree_row$order]
sample_order <- intersect(sample_order, rownames(combined_mat))

cluster_annotations <- subset(combined_df, select=c(condensed_id, cluster))

SIGS <- c("SV-3", "SNV-5", "SV-6", "SV-8", "SNV-2")
sig_annotations <- rownames_to_column(data.frame(sample_props_mat[,SIGS], check.names = FALSE), var = "condensed_id")

subtype_annotations <- subset(icgc_subtypes, select=c("icgc_donor_id", "subtype", "nmf_subtype"))
colnames(subtype_annotations)[1] <- "condensed_id"

icgc_fbi_annotations <- subset(icgc_fbi_status, select=c("condensed_id", "Patch et al. Class", "Patch et al. Molecular Subgroup", "BRCA.status", "Subgroup"))

row_annotations <- Reduce(plyr::join, list(cluster_annotations, sig_annotations, subtype_annotations, icgc_fbi_annotations))
row_annotations <- row_annotations %>% column_to_rownames(var = "condensed_id")
row_annotations_noith <- row_annotations

#select_rows <- rownames(subset(row_annotations, `SV-4` < 1))
notx <- subset(specimen_data, str_detect(specimen_type, "Primary") & specimen_donor_treatment_type == "no treatment")$icgc_donor_id
#sample_order <- intersect(sample_order, select_rows)
sample_order <- intersect(sample_order, notx)

#order_fbi <- order(row_annotations[rownames(combined_mat_scaled),]$`SV-8`)

#combined_mat_scaled <- scale(combined_mat)
combined_mat_scaled <- as.data.frame(apply(combined_mat, 2, function(x) (x-median(x,na.rm=TRUE))/mad(x,na.rm=TRUE)))

combined_mat_scaled <- combined_mat_scaled[sample_order,]
other_mat <- sample_props_mat_scaled[sample_order,]

pheatmap(clip_values(cbind(combined_mat_scaled, other_mat), 2, -2), cluster_rows = FALSE, cluster_cols = TRUE, annotation_row = row_annotations, fontsize = 6, clustering_distance_cols = "correlation", clustering_method = "ward.D2")

combined_mat_scaled_noith <- combined_mat_scaled
```


```{r}
icgc_clinical_labeled <- Reduce(function(x,y) plyr::join(x,y,type='full'), list(setNames(cluster_annotations, c("icgc_donor_id", "cluster")), icgc_clinical, setNames(subset(icgc_fbi_annotations, select=c("condensed_id", "BRCA.status", "Subgroup")), c("icgc_donor_id", "BRCA.status", "Wang_subgroup"))))

icgc_clinical_labeled$SurvObj <- with(icgc_clinical_labeled, Surv(donor_survival_time, donor_vital_status == "deceased"))
```

Survival by FBI status ...

```{r}
simple_survival_analysis(SurvObj ~ cluster != 1, data = subset(icgc_clinical_labeled, cluster != 4))
```

```{r}
simple_survival_analysis(SurvObj ~ cluster, data = subset(icgc_clinical_labeled, cluster != 4))
```

```{r}
simple_survival_analysis(SurvObj ~ Wang_subgroup, data = icgc_clinical_labeled)
```

#### Combined

##### Sample-level

```{r}
combined_mat_scaled_all <- rbind.fill(combined_mat_scaled_ith %>% as.data.frame %>% rownames_to_column("condensed_id"), combined_mat_scaled_noith %>% as.data.frame %>% rownames_to_column("condensed_id")) %>% column_to_rownames("condensed_id")

row_annotations_all <- rbind.fill(row_annotations_ith %>% rownames_to_column("condensed_id"), row_annotations_noith %>% rownames_to_column("condensed_id")) %>% column_to_rownames("condensed_id")

sample_order <- sample_props_heat$tree_row$labels[sample_props_heat$tree_row$order]
sample_order <- intersect(sample_order, rownames(combined_mat_scaled_all))

#combined_mat_scaled <- as.data.frame(apply(combined_mat_all, 2, function(x) (x-median(x,na.rm=TRUE))/mad(x,na.rm=TRUE)))
combined_mat_scaled <- combined_mat_scaled_all[sample_order,]

nanostring_vars <- rownames(icgc_celltype_matrix)
combined_mat_scaled_subset <- subset(combined_mat_scaled, select=nanostring_vars)
combined_mat_scaled_subset <- combined_mat_scaled_subset[!apply(combined_mat_scaled_subset, 1, function(x) all(is.na(x))),]
```

```{r, fig.width=7, fig.height=12}
pheatmap(clip_values(combined_mat_scaled_subset, 2, -2), cluster_rows = FALSE, cluster_cols = TRUE, annotation_row = row_annotations_all, fontsize = 6, clustering_distance_cols = "correlation", clustering_method = "ward.D2")
```

Doesn't really cluster consistently between ICGC and our samples. 

TODO: Batch correct ICGC and our data together (I think I already did this somewhere) -- and normalize the final matrix in one step rather than normalizing separately for each subcohort and combining those together. Otherwise we could always be subject to the scenario where the ITH cohort may be skewed in immune response (either all relatively low or high) compared to the ICGC cohort. 

TODO: Use absolute counts of mutations for each signature, and cluster based on those. It may be that samples with too low/high mutation counts are not clustering properly. 

##### Patient-level

```{r}
#ith_icgc_batch_corrected_expression_file <- "~/shahlab/projects/ITH_Immune/paper/results/tables/run2/ith_icgc_merged_bc.tsv"

ith_icgc_expression <- fread(ith_icgc_batch_corrected_expression_file)

ith_icgc_celltype_expr <- create_celltype_matrix(ith_icgc_expression, labels, db_path, convert_ids = FALSE)
ith_icgc_pathway_expr <- create_pathway_matrix(ith_icgc_expression, labels, db_path, convert_ids = FALSE)

icgc_specimen_data <- fread(icgc_specimen_file)
```

```{r}
summarize_expression_by_patient <- function(expr) {
  primary_specimen_dat <- subset(icgc_specimen_data, str_detect(specimen_type, "Primary"))
  
  notx <- subset(primary_specimen_dat, str_detect(specimen_type, "Primary") & specimen_donor_treatment_type == "no treatment")$icgc_donor_id
  primary_specimen_dat <- subset(primary_specimen_dat, icgc_donor_id %in% notx)
  
  x <- setNames(melt(as.matrix(expr)), c("Name", "sample", "expr"))
  x$patient_id <- map_id(as.character(x$sample), from = "condensed_id", to="patient_id", db_path)
  idx <- is.na(x$patient_id)
  donor_labels <- df_as_map(primary_specimen_dat, as.character(x$sample[idx]), from = "icgc_specimen_id",to = "icgc_donor_id")
  x$patient_id[idx] <- donor_labels
  x <- subset(x, !is.na(patient_id))
  x_sum <- x %>% group_by(Name, patient_id) %>% summarise(expr=mean(expr, na.rm=TRUE))
  
  res <- dcast(x_sum, formula = Name ~ patient_id, value.var = "expr")
  return(res)
}
```

```{r}
NCLUST <- 3
clusters <- cutree(patient_props_heat$tree_row, NCLUST)
patient_clusts <- make_cluster_frame(clusters)

cluster_annotations <- subset(plyr::rename(patient_clusts, c('new_id'='patient_id')), select=c(patient_id, cluster))

ith_icgc_celltype_expr_patient <- summarize_expression_by_patient(ith_icgc_celltype_expr)
ith_icgc_pathway_expr_patient <- summarize_expression_by_patient(ith_icgc_pathway_expr)

expr_dat <- ith_icgc_pathway_expr_patient
combined_patient_mat <- expr_dat %>% as.data.frame %>% column_to_rownames(var = "Name") %>% 
  t %>% as.data.frame %>% rownames_to_column("patient_id")

data_matrices <- list(cluster_annotations, combined_patient_mat)
data_matrices <- lapply(data_matrices, function(x) {
  x
})

combined_df <- Reduce(function(x, y) plyr::join(x,y,type='inner'), data_matrices)
combined_mat <- subset(combined_df, select=-c(cluster))
rownames(combined_mat) <- NULL
combined_mat <- combined_mat %>% column_to_rownames("patient_id")

SIGS <- c("SV-3", "SNV-5", "SV-6", "SV-8", "SNV-2")
sig_annotations <- rownames_to_column(data.frame(patient_props_mat[,SIGS], check.names = FALSE), var = "patient_id")

subtype_annotations <- subset(icgc_subtypes, select=c("icgc_donor_id", "subtype", "nmf_subtype"))
colnames(subtype_annotations)[1] <- "patient_id"

icgc_fbi_status_patient <- fread(wang_icgc_fbi_status_file)
colnames(icgc_fbi_status_patient) <- mapvalues(colnames(icgc_fbi_status_patient), from = c("Case_ID"), to = c("patient_id"))

icgc_fbi_annotations <- subset(icgc_fbi_status_patient, select=c("patient_id", "Patch et al. Class", "Patch et al. Molecular Subgroup", "BRCA.status", "Subgroup"))

row_annotations <- Reduce(plyr::join, list(cluster_annotations, sig_annotations, subtype_annotations, icgc_fbi_annotations))

row_annotations <- row_annotations %>% column_to_rownames("patient_id")

combined_patient_mat_scaled <- combined_mat %>% scale

patient_order <- patient_props_heat$tree_row$labels[patient_props_heat$tree_row$order]
patient_order <- intersect(patient_order, rownames(combined_patient_mat_scaled))
```

```{r, fig.width=7, fig.height=10}
pheatmap(clip_values(combined_patient_mat_scaled[patient_order,], 2, -2), cluster_rows = FALSE, cluster_cols = TRUE, annotation_row = row_annotations, fontsize = 6, clustering_distance_cols = "correlation", clustering_method = "ward.D2")
```



### Ancestral-descendant signature proportions

```{r}
ancdesc_props_snv <- unique(subset(sig_results_snv, select=c("patient_id", "is_ancestral", "chrom", "coord", "ref", "alt", signature_labels[signature_labels %in% colnames(sig_results_snv)]))) %>% summarize_signature_proportions(by=c("patient_id", "is_ancestral"), signature_labels)
ancdesc_props_sv <- unique(subset(sig_results_sv, select=c("patient_id", "prediction_id", "is_ancestral", signature_labels[signature_labels %in% colnames(sig_results_sv)]))) %>% summarize_signature_proportions(by=c("patient_id", "is_ancestral"), signature_labels)

ancdesc_props <- plyr::join(ancdesc_props_snv %>% as.data.frame, ancdesc_props_sv %>% as.data.frame)

ancdesc_props_mat <- subset(ancdesc_props, select=colnames(ancdesc_props)[colnames(ancdesc_props) %in% signature_labels])
```

```{r}
ancdesc_props_scaled <- scale(ancdesc_props_mat)
rownames(ancdesc_props_scaled) <- with(ancdesc_props, paste(patient_id, is_ancestral, sep="_"))

pheatmap(ancdesc_props_scaled, clustering_distance_rows = "euclidean", clustering_method = "ward.D2")
```

```{r}
ancdesc_props_melted <- melt(ancdesc_props, id.vars = c("patient_id", "is_ancestral"), measure.vars = intersect(colnames(ancdesc_props), signature_labels), variable.name = "signature", value.name = "proportion")

pvals <- setNames(ddply(ancdesc_props_melted, .(signature), function(x) {
  df <- as.data.frame(x)
  corres <- wilcox.test(proportion ~ is_ancestral, df, paired=TRUE)
  
  pval <- corres$p.value
  eq <- substitute(italic(P)==p, list(p=format(pval, digits=3)))
  return(as.character(as.expression(eq)))
}), c("signature", "p.value"))

ggplot(ancdesc_props_melted, aes(x=factor(is_ancestral), y=proportion)) + geom_boxplot() + facet_wrap(~ signature, scales="free") + geom_text(data=pvals, aes(x=Inf, y=Inf, label=p.value), hjust=1.1, vjust=1.5,size=3,parse=TRUE) + 
theme_bw() + theme_Publication()
```

### Origin node signature proportions

**Note**: We cannot get node-specific rearrangement signatures. 

**Note**: These node labels DO NOT correspond to node labels within the clone phylogeny; they correspond to nodes within the Dollo model. 

```{r}
node_props_snv <- unique(subset(sig_results_snv, select=c("patient_id", "origin_node", "chrom", "coord", "ref", "alt", signature_labels[signature_labels %in% colnames(sig_results_snv)]))) %>% summarize_signature_proportions(by=c("patient_id", "origin_node"), signature_labels, report_count = TRUE)
```
```{r}
node_props_mat <- subset(node_props_snv, select=colnames(node_props_snv)[colnames(node_props_snv) %in% signature_labels])
```

```{r, fig.width=7, fig.height=10}
node_props_scaled <- scale(node_props_mat)
rownames(node_props_scaled) <- with(node_props_snv, paste(patient_id, origin_node, sep="_"))

row_annotations <- subset(node_props_snv, select=c("patient_id", "n"))
row_annotations$n <- log2(row_annotations$n)
row_annotations$patient_id <- factor_id(row_annotations$patient_id, type = "patient_id", db_path)
rownames(row_annotations) <- rownames(node_props_scaled)

clustering_colours <- list(patient_id = pal_patient)

pheatmap(clip_values(node_props_scaled, 2, -2), clustering_distance_rows = "euclidean", clustering_method = "ward.D2", fontsize_row = 5, annotation_row = row_annotations, annotation_colors = clustering_colours)
```

Looks like there are similar selection pressures acting at each part of the sample phylogeny -- i.e. mutation signatures are 'relatively' consistent within patients throughout time. 

### Clonal phylogeny branch-specific signature proportions

NOTE: THE LABELS ON THESE TREES MAY NOT BE THE SAME AS THOSE IN THE MAPSCAPES (ugh). 

```{r}
tree_branch_data <- read_clone_tree_data(clone_tree_file, clone_branch_length_file, clone_prevalence_file, db_path)
trees <- lapply(tree_branch_data, function(x) x$tree)
branch_lengths <- rbind.fill(lapply(tree_branch_data, function(x) x$branch_dat))
```

```{r}
snv_cluster <- rbind.fill(lapply(seq_along(snv_cluster_files), function(i) {
  f <- snv_cluster_files[[i]]
  patient_id <- snv_cluster_patients[[i]]
  snv_cluster <- read_snv_cluster(f, clone_branch_length_file, db_path)
  return(snv_cluster)
}))

snv_cluster <- plyr::join(snv_cluster, branch_lengths)
```

```{r}
snv_cluster_filtered <- subset(snv_cluster, !is.na(label))

sig_results_snv_cluster <- plyr::join(sig_results_snv, snv_cluster_filtered)
```

```{r}
branch_props_snv <- unique(subset(sig_results_snv_cluster, select=c("patient_id", "label", "chrom", "coord", "ref", "alt", signature_labels[signature_labels %in% colnames(sig_results_snv_cluster)]))) %>% summarize_signature_proportions(by=c("patient_id", "label"), signature_labels, report_count = TRUE)
```

```{r}
branch_props_snv_melted <- melt(branch_props_snv, id.vars = c("patient_id", "label", "n"), measure.vars = intersect(signature_labels, colnames(branch_props_snv)), variable.name = "signature", value.name = "proportion")
```

```{r, fig.width=7, fig.height=25}
ggplot(branch_props_snv_melted, aes(x=label, y=proportion)) + geom_bar(aes(fill=signature), stat = "identity") + facet_wrap(~ patient_id, ncol=1, scales = "free_x") + theme_bw() + theme_Publication() + geom_text(data=unique(subset(branch_props_snv_melted, select=c("patient_id", "label", "n"))), aes(x=label, y=1, label=n),vjust=-.2,stat="identity") + ylim(c(0, 1.2))
```

What if we use MAP assignments? (This would allow us to apply chi-square tests.) 

TODO: Figure out how to apply tests between k > 2 groups of proportions... in other words, a test of homogeneity. 

```{r}
id_vars <- colnames(sig_results_snv_cluster)[!colnames(sig_results_snv_cluster) %in% signature_labels]

sig_results_snv_cluster_melted <- melt(sig_results_snv_cluster, id.vars = id_vars, measure.vars = intersect(signature_labels, colnames(sig_results_snv_cluster)), variable.name = "signature", value.name = "proportion")
maxes <- sig_results_snv_cluster_melted %>% group_by_(.dots = id_vars) %>% summarise(maxprop=max(proportion))
sig_results_snv_cluster_melted <- plyr::join(sig_results_snv_cluster_melted,maxes)
sig_results_snv_cluster_melted$proportion_map <- ifelse(sig_results_snv_cluster_melted$proportion == sig_results_snv_cluster_melted$maxprop, 1, 0)

sig_results_snv_cluster_casted <- dcast(sig_results_snv_cluster_melted, formula = paste0(paste(id_vars, collapse="+"), "~ signature"), value.var = "proportion_map")

branch_props_snv_map <- unique(subset(sig_results_snv_cluster_casted, select=c("patient_id", "label", "chrom", "coord", "ref", "alt", signature_labels[signature_labels %in% colnames(sig_results_snv_cluster_casted)]))) %>% summarize_signature_proportions(by=c("patient_id", "label"), signature_labels, report_count = TRUE)

branch_props_snv_map_melted <- melt(branch_props_snv_map, id.vars = c("patient_id", "label", "n"), measure.vars = intersect(signature_labels, colnames(branch_props_snv_map)), variable.name = "signature", value.name = "proportion")
```


```{r, fig.width=7, fig.height=25}
ggplot(branch_props_snv_map_melted, aes(x=label, y=proportion)) + geom_bar(aes(fill=signature), stat = "identity") + facet_wrap(~ patient_id, ncol=1, scales = "free_x") + theme_bw() + theme_Publication() + geom_text(data=unique(subset(branch_props_snv_map_melted, select=c("patient_id", "label", "n"))), aes(x=label, y=1, label=n),vjust=-.2,stat="identity") + ylim(c(0, 1.2))
```

### Adjusted clone trees

```{r}
trees_age <- trees
age_signature <- "SNV-5"

branch_props_dat <- branch_props_snv_map_melted

tree_objects <- lapply(seq_along(trees_age), function(i) {
  tree <- trees_age[[i]]
  tree_old <- tree
  patient <- names(trees_age)[i]
  branch_props_dat_sub <- subset(branch_props_dat, patient_id == as.numeric(patient) & signature == age_signature)
  branch_props_dat_sub$length <- with(branch_props_dat_sub, proportion*n)
  
  edge_lengths <- tree@edge.length
  idx <- which(edge_lengths == 0)
  to_labels <- tree@label[str_extract(names(edge_lengths), "[0-9]+$")]
  lengths <- df_as_map(branch_props_dat_sub, unname(to_labels), from = "label", to="length")
  if (length(idx) > 0) {
    lengths[idx] <- 0
  }
  tree@edge.length <- lengths
  names(tree@edge.length) <- names(edge_lengths)
  
  return(list(all=tree_old, age=tree))
})
names(tree_objects) <- names(trees_age)
```

```{r, fig.width=20, fig.height=20}
tmp <- unlist(tree_objects)
#ignore <- lapply(seq_along(tmp), function(i) {
#  print(plot(tmp[[i]], show.node.label = TRUE, main = names(tmp)[i]))
#})
```

```{r}
find_ancestors <- function(tree) {
  x <- phylobase:::.phylo4ToDataFrame(tree)
  root <- subset(x, node.type == "root")$label
  root_number <- names(which(tree@label == root))
  direct_descendants <- subset(x, ancestor == as.numeric(root_number))$label
  if (length(direct_descendants) == 1) {
    return(c(root, direct_descendants))
  } else {
    return(root)
  }
}
```

```{r}
patients <- unique(branch_props_dat$patient_id)
root_data <- rbind.fill(lapply(patients, function(pat) {
  tree <- trees[[as.character(pat)]]
  ancestors <- find_ancestors(tree)
  rbind.fill(lapply(ancestors, function(x) {
    data.frame(label=x, patient_id=pat)
  }))
}))
root_data$node_type <- "root"
```

```{r}
branch_data_annotated <- plyr::join(branch_props_dat, root_data)
branch_data_annotated$node_type[is.na(branch_data_annotated$node_type)] <- "descendant"

root_proportions <- branch_data_annotated %>% subset(node_type == "root") %>% group_by(patient_id, signature) %>% summarise(proportion=weighted.mean(proportion, w = n)) %>% plyr::rename(c('proportion'='root_proportion'))

branch_data_annotated <- plyr::join(branch_data_annotated, root_proportions)
branch_data_annotated$proportion_diff <- with(branch_data_annotated, proportion - root_proportion)
```

```{r}
ggplot(branch_data_annotated %>% subset(node_type == "descendant" & n > 40), aes(x=proportion_diff, fill=signature)) + geom_histogram(binwidth=0.02, alpha=0.4, position='identity') + theme_bw() + theme_Publication()
```