library(ithi.utils)
load_base_libs()
library(methods)
library(factoextra)
library(ithi.meta)
library(ithi.figures)
library(ithi.utils)
library(ithi.seq)
library(ithi.clones)
library(ithi.supp)
ihc_table_path <- snakemake@input$ihc_table
neoediting_outdir <- snakemake@input$neoediting_outdir
snv_cluster_files <- snakemake@input$snv_cluster_files
clone_tree_file <- snakemake@input$clone_tree_file
clone_branch_length_file <- snakemake@input$clone_branch_length_file
clone_prevalence_file <- snakemake@input$clone_prevalence_file
remixt_ploidy_file <- snakemake@input$remixt_cellularity_ploidy_file
master_variant_file <- snakemake@input$snv_table
db_path <- snakemake@params$db
annotation_colours <- ithi.figures::get_annotation_colours()
ihc_table <- fread(ihc_table_path)
master_variant_table <- read_variant_file(master_variant_file, db_path)
tree_branch_data <- read_clone_tree_data(clone_tree_file, clone_branch_length_file,
clone_prevalence_file, db_path)
neoediting_res <- supp_neoediting(neoediting_outdir, ihc_table, db_path, tree_branch_data,
wtfilter = TRUE, full_epitopes = FALSE, snv_cluster_files = snv_cluster_files)
We need to answer 2 questions:
To examine this, we’ll consider ploidy. Under WGD, the ploidy of a tumour sample should be approximately 4.
remixt_ploidy <- read.table(remixt_ploidy_file, row.names = 1, header = TRUE,
stringsAsFactors = FALSE)
remixt_ploidy <- remixt_ploidy %>% rownames_to_column(var = "voa")
remixt_ploidy$patient_id <- ithi.meta::factor_id(remixt_ploidy$patient_id, type = "patient_id",
db_path)
remixt_ploidy$condensed_id <- ithi.meta::map_id(remixt_ploidy$voa, from = "voa",
to = "condensed_id", db_path)
We’ll address the first bullet point here.
Inclusion/exclusion criteria:
## This should be refactored into a package
get_annotated_snvs <- function(snv_cluster_files, tree_branch_data) {
snv_cluster_patients <- as.list(stringr::str_extract(snv_cluster_files,
"(?<=patient_)[0-9]+"))
snv_cluster <- plyr::rbind.fill(lapply(seq_along(snv_cluster_files), function(i) {
f <- snv_cluster_files[[i]]
patient_id <- snv_cluster_patients[[i]]
snv_cluster <- ithi.clones::read_snv_cluster(f, clone_branch_length_file,
db_path)
return(snv_cluster)
}))
trees <- lapply(tree_branch_data, function(x) x$tree)
branch_lengths <- plyr::rbind.fill(lapply(tree_branch_data, function(x) x$branch_dat))
snv_cluster <- plyr::join(snv_cluster, branch_lengths)
patients <- unique(unlist(snv_cluster_patients))
root_data <- plyr::rbind.fill(lapply(patients, function(pat) {
tree <- trees[[as.character(pat)]]
ancestors <- ithi.supp:::find_ancestors(tree)
plyr::rbind.fill(lapply(ancestors, function(x) {
data.frame(label = x, patient_id = pat)
}))
}))
root_data$node_type <- "root"
root_data$patient_id <- as.numeric(as.character(root_data$patient_id))
snv_cluster_annotated <- plyr::join(snv_cluster, root_data, type = "left") %>%
plyr::rename(c(chrom = "Chromosome", ref = "Reference", alt = "Variant",
coord = "Start"))
return(snv_cluster_annotated)
}
snvs_annotated <- get_annotated_snvs(snv_cluster_files, tree_branch_data)
snvs_annotated_filtered <- subset(snvs_annotated, !is.na(node))
snvs_annotated_filtered$node_type[is.na(snvs_annotated_filtered$node_type)] <- "subclonal"
master_variant_present <- subset(master_variant_table, is_present == 1)
snvs_annotated_filtered_renamed <- snvs_annotated_filtered %>% plyr::rename(c(Chromosome = "chrom",
Start = "coord", Reference = "ref", Variant = "alt"))
master_variant_annotated <- merge(master_variant_present, snvs_annotated_filtered_renamed,
by = c("patient_id", "chrom", "coord", "ref", "alt"))
Ok, now we can do the counting.
node_type_snv_count <- master_variant_annotated %>% group_by(patient_id, sample_key,
condensed_id, node_type) %>% summarise(num_snv = n())
subclonal_snv_proportions <- node_type_snv_count %>% group_by(patient_id, sample_key,
condensed_id) %>% summarise(snv_subclonal_prop = num_snv[node_type == "subclonal"]/sum(num_snv),
snv_subclonal_num = num_snv[node_type == "subclonal"])
subclonal_snv_proportions$patient_id <- ithi.meta::factor_id(subclonal_snv_proportions$patient_id,
type = "patient_id", db_path)
subclonal_snv_ploidy <- subclonal_snv_proportions %>% as.data.frame %>% plyr::join(remixt_ploidy %>%
as.data.frame)
subclonal_snv_ploidy_melted <- melt(subclonal_snv_ploidy, id.vars = c("patient_id",
"condensed_id", "psi", "Cellularity"), measure.vars = c("snv_subclonal_prop",
"snv_subclonal_num"), variable.name = "measure", value.name = "value")
pvals <- ithi.utils::compute_pvals_subsets(subclonal_snv_ploidy_melted, facet_vars = c("measure"),
formula = ~psi + value, corfun = cor.test, method = "spearman")
p <- ggplot(subclonal_snv_ploidy_melted, aes(x = psi, y = value)) + geom_point(aes(colour = patient_id)) +
theme_bw() + theme_Publication() + theme_nature() + scale_colour_manual(values = annotation_colours$patient_id) +
xlab("Ploidy") + ylab("Subclonal SNVs") + facet_wrap(~measure, scales = "free") +
geom_text(data = pvals, aes(x = Inf, y = Inf, label = p.value.text), hjust = 1.1,
vjust = 1.5, size = 2.5, parse = TRUE)
p
So no, we don’t see a higher proportion/number of subclonal SNVs in samples with higher ploidy. Given the amount of effort taken to incorporate copy number/ploidy in the clonality model, this is not entirely surprising.
Next, we’ll address the second bullet point.
subclonal_neoediting_ploidy <- plyr::join(remixt_ploidy, neoediting_res$subclonal_rates %>%
plyr::rename(c(sample_key = "condensed_id")), by = c("condensed_id", "patient_id"))
clonal_neoediting_ploidy <- plyr::join(remixt_ploidy, neoediting_res$clonal_rates %>%
plyr::rename(c(sample_key = "condensed_id")), by = c("condensed_id", "patient_id"))
subclonal_neoediting_ploidy$epitope_rate <- with(subclonal_neoediting_ploidy,
nepitopes/ntotal)
clonal_neoediting_ploidy$epitope_rate <- with(clonal_neoediting_ploidy, nepitopes/ntotal)
subclonal_neoediting_ploidy_melted <- melt(subclonal_neoediting_ploidy, id.vars = c("patient_id",
"condensed_id", "psi", "Cellularity"), measure.vars = c("nepitopes", "epitope_rate"),
variable.name = "measure", value.name = "value")
pvals <- ithi.utils::compute_pvals_subsets(subclonal_neoediting_ploidy_melted,
facet_vars = c("measure"), formula = ~psi + value, corfun = cor.test, method = "spearman")
p1 <- ggplot(subclonal_neoediting_ploidy_melted, aes(x = psi, y = value)) +
geom_point(aes(colour = patient_id)) + theme_bw() + theme_Publication() +
theme_nature() + scale_colour_manual(values = annotation_colours$patient_id) +
xlab("Ploidy") + ylab("Number of subclonal neoepitopes") + facet_wrap(~measure,
scales = "free") + geom_text(data = pvals, aes(x = Inf, y = Inf, label = p.value.text),
hjust = 1.1, vjust = 1.5, size = 2.5, parse = TRUE)
p1
Thus, higher ploidy is not significantly associated with a higher subclonal neoepitope load or proportion.
clonal_neoediting_ploidy_melted <- melt(clonal_neoediting_ploidy, id.vars = c("patient_id",
"condensed_id", "psi", "Cellularity"), measure.vars = c("nepitopes", "epitope_rate"),
variable.name = "measure", value.name = "value")
pvals <- ithi.utils::compute_pvals_subsets(clonal_neoediting_ploidy_melted,
facet_vars = c("measure"), formula = ~psi + value, corfun = cor.test, method = "spearman")
p2 <- ggplot(clonal_neoediting_ploidy_melted, aes(x = psi, y = value)) + geom_point(aes(colour = patient_id)) +
theme_bw() + theme_Publication() + theme_nature() + scale_colour_manual(values = annotation_colours$patient_id) +
xlab("Ploidy") + ylab("Number of subclonal neoepitopes") + facet_wrap(~measure,
scales = "free") + geom_text(data = pvals, aes(x = Inf, y = Inf, label = p.value.text),
hjust = 1.1, vjust = 1.5, size = 2.5, parse = TRUE)
p2
Just for fun, we did this for clonal neoepitopes too – same result.
What about subclonal obs/exp neoepitope rates (i.e. the immunoediting indices)?
corres <- with(subclonal_neoediting_ploidy, cor.test(psi, expratio/obsratio,
method = "spearman"))
eq <- substitute(italic(P) == p, list(p = format(corres$p.value, digits = 3)))
p1 <- ggplot(subclonal_neoediting_ploidy, aes(x = psi, y = expratio/obsratio)) +
geom_point(aes(colour = patient_id)) + theme_bw() + theme_Publication() +
theme_nature() + scale_colour_manual(values = annotation_colours$patient_id) +
xlab("Ploidy") + ylab("E_i (subclonal)") + annotate("text", x = Inf, y = Inf,
parse = TRUE, hjust = 1, vjust = 1, label = as.character(as.expression(eq)))
p1
corres <- with(clonal_neoediting_ploidy, cor.test(psi, expratio/obsratio, method = "spearman"))
eq <- substitute(italic(P) == p, list(p = format(corres$p.value, digits = 3)))
p2 <- ggplot(clonal_neoediting_ploidy, aes(x = psi, y = expratio/obsratio)) +
geom_point(aes(colour = patient_id)) + theme_bw() + theme_Publication() +
theme_nature() + scale_colour_manual(values = annotation_colours$patient_id) +
xlab("Ploidy") + ylab("E_i (clonal)") + annotate("text", x = Inf, y = Inf,
parse = TRUE, hjust = 1, vjust = 1, label = as.character(as.expression(eq)))
p2
Again, no significant correlations. Now, to hammer it home, let’s add ploidy and cellularity into the neoepitope elimination model.
The base model is:
mod <- glmer(expratio/obsratio ~ E_CD8_rescaled + (1 | patient_id), data = subset(subclonal_neoediting_ploidy,
!is.na(E_CD8_density)), family = Gamma(link = "log"))
summod <- summary(mod)
summod
Generalized linear mixed model fit by maximum likelihood (Laplace
Approximation) [glmerMod]
Family: Gamma ( log )
Formula: expratio/obsratio ~ E_CD8_rescaled + (1 | patient_id)
Data: subset(subclonal_neoediting_ploidy, !is.na(E_CD8_density))
AIC BIC logLik deviance df.resid
-42.3 -34.7 25.2 -50.3 46
Scaled residuals:
Min 1Q Median 3Q Max
-1.7552 -0.5698 -0.1755 0.5849 1.5509
Random effects:
Groups Name Variance Std.Dev.
patient_id (Intercept) 0.02454 0.1567
Residual 0.01552 0.1246
Number of obs: 50, groups: patient_id, 12
Fixed effects:
Estimate Std. Error t value Pr(>|z|)
(Intercept) 0.1079 0.1000 1.079 0.281
E_CD8_rescaled 0.4505 0.1140 3.953 7.72e-05 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Correlation of Fixed Effects:
(Intr)
E_CD8_rscld -0.198
# pval <- unname(summod$coefficients[,4][2])
Now adding ploidy and cellularity as fixed effects:
mod <- glmer(expratio/obsratio ~ E_CD8_rescaled + psi + Cellularity + (1 | patient_id),
data = subset(subclonal_neoediting_ploidy, !is.na(E_CD8_density)), family = Gamma(link = "log"))
summod <- summary(mod)
summod
Generalized linear mixed model fit by maximum likelihood (Laplace
Approximation) [glmerMod]
Family: Gamma ( log )
Formula: expratio/obsratio ~ E_CD8_rescaled + psi + Cellularity + (1 |
patient_id)
Data: subset(subclonal_neoediting_ploidy, !is.na(E_CD8_density))
AIC BIC logLik deviance df.resid
-45.3 -33.9 28.7 -57.3 44
Scaled residuals:
Min 1Q Median 3Q Max
-1.79148 -0.58635 -0.07179 0.62726 1.53538
Random effects:
Groups Name Variance Std.Dev.
patient_id (Intercept) 0.02424 0.1557
Residual 0.01365 0.1168
Number of obs: 50, groups: patient_id, 12
Fixed effects:
Estimate Std. Error t value Pr(>|z|)
(Intercept) 0.005927 0.147842 0.040 0.968020
E_CD8_rescaled 0.384110 0.108387 3.544 0.000394 ***
psi -0.010788 0.032369 -0.333 0.738924
Cellularity 0.252936 0.093255 2.712 0.006682 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Correlation of Fixed Effects:
(Intr) E_CD8_ psi
E_CD8_rscld -0.026
psi -0.651 -0.021
Cellularity -0.377 -0.240 0.074
# pval <- unname(summod$coefficients[,4][2])
The p-value = 3.94305310^{-4} for the epithelial CD8+ TIL density effect remains significant. Ploidy and cellularity are not significant effects. As a sanity check, we make sure that the beta (coefficient) for E_CD8_rescaled remains the same sign in both cases, which it does.