Setup

library(ithi.utils)
load_base_libs()

library(methods)
library(factoextra)

library(ithi.meta)
library(ithi.figures)
library(ithi.utils)
library(ithi.seq)
library(ithi.clones)
library(ithi.supp)
ihc_table_path <- snakemake@input$ihc_table
neoediting_outdir <- snakemake@input$neoediting_outdir
snv_cluster_files <- snakemake@input$snv_cluster_files
clone_tree_file <- snakemake@input$clone_tree_file
clone_branch_length_file <- snakemake@input$clone_branch_length_file
clone_prevalence_file <- snakemake@input$clone_prevalence_file
remixt_ploidy_file <- snakemake@input$remixt_cellularity_ploidy_file
master_variant_file <- snakemake@input$snv_table

db_path <- snakemake@params$db
annotation_colours <- ithi.figures::get_annotation_colours()
ihc_table <- fread(ihc_table_path)
master_variant_table <- read_variant_file(master_variant_file, db_path)

tree_branch_data <- read_clone_tree_data(clone_tree_file, clone_branch_length_file, 
    clone_prevalence_file, db_path)

neoediting_res <- supp_neoediting(neoediting_outdir, ihc_table, db_path, tree_branch_data, 
    wtfilter = TRUE, full_epitopes = FALSE, snv_cluster_files = snv_cluster_files)

Analysis

We need to answer 2 questions:

  • Do whole-genome duplicated (or high ploidy) samples have a higher number/proportion of subclonal mutations?
  • Do whole-genome duplicated (or high ploidy) samples have a higher number/proportion of subclonal neoepitope-generating mutations?

WGD

To examine this, we’ll consider ploidy. Under WGD, the ploidy of a tumour sample should be approximately 4.

remixt_ploidy <- read.table(remixt_ploidy_file, row.names = 1, header = TRUE, 
    stringsAsFactors = FALSE)

remixt_ploidy <- remixt_ploidy %>% rownames_to_column(var = "voa")
remixt_ploidy$patient_id <- ithi.meta::factor_id(remixt_ploidy$patient_id, type = "patient_id", 
    db_path)
remixt_ploidy$condensed_id <- ithi.meta::map_id(remixt_ploidy$voa, from = "voa", 
    to = "condensed_id", db_path)

Subclonal mutations

We’ll address the first bullet point here.

Inclusion/exclusion criteria:

  • When calling clonality/subclonality, we exclude mutations that cannot be unambiguously assigned to a clone in the clonal phylogeny (according to maximum likelihood, see STAR Methods).
## This should be refactored into a package
get_annotated_snvs <- function(snv_cluster_files, tree_branch_data) {
    snv_cluster_patients <- as.list(stringr::str_extract(snv_cluster_files, 
        "(?<=patient_)[0-9]+"))
    snv_cluster <- plyr::rbind.fill(lapply(seq_along(snv_cluster_files), function(i) {
        f <- snv_cluster_files[[i]]
        patient_id <- snv_cluster_patients[[i]]
        snv_cluster <- ithi.clones::read_snv_cluster(f, clone_branch_length_file, 
            db_path)
        return(snv_cluster)
    }))
    trees <- lapply(tree_branch_data, function(x) x$tree)
    branch_lengths <- plyr::rbind.fill(lapply(tree_branch_data, function(x) x$branch_dat))
    snv_cluster <- plyr::join(snv_cluster, branch_lengths)
    patients <- unique(unlist(snv_cluster_patients))
    root_data <- plyr::rbind.fill(lapply(patients, function(pat) {
        tree <- trees[[as.character(pat)]]
        ancestors <- ithi.supp:::find_ancestors(tree)
        plyr::rbind.fill(lapply(ancestors, function(x) {
            data.frame(label = x, patient_id = pat)
        }))
    }))
    root_data$node_type <- "root"
    root_data$patient_id <- as.numeric(as.character(root_data$patient_id))
    snv_cluster_annotated <- plyr::join(snv_cluster, root_data, type = "left") %>% 
        plyr::rename(c(chrom = "Chromosome", ref = "Reference", alt = "Variant", 
            coord = "Start"))
    return(snv_cluster_annotated)
}
snvs_annotated <- get_annotated_snvs(snv_cluster_files, tree_branch_data)
snvs_annotated_filtered <- subset(snvs_annotated, !is.na(node))
snvs_annotated_filtered$node_type[is.na(snvs_annotated_filtered$node_type)] <- "subclonal"
master_variant_present <- subset(master_variant_table, is_present == 1)

snvs_annotated_filtered_renamed <- snvs_annotated_filtered %>% plyr::rename(c(Chromosome = "chrom", 
    Start = "coord", Reference = "ref", Variant = "alt"))

master_variant_annotated <- merge(master_variant_present, snvs_annotated_filtered_renamed, 
    by = c("patient_id", "chrom", "coord", "ref", "alt"))

Ok, now we can do the counting.

node_type_snv_count <- master_variant_annotated %>% group_by(patient_id, sample_key, 
    condensed_id, node_type) %>% summarise(num_snv = n())

subclonal_snv_proportions <- node_type_snv_count %>% group_by(patient_id, sample_key, 
    condensed_id) %>% summarise(snv_subclonal_prop = num_snv[node_type == "subclonal"]/sum(num_snv), 
    snv_subclonal_num = num_snv[node_type == "subclonal"])
subclonal_snv_proportions$patient_id <- ithi.meta::factor_id(subclonal_snv_proportions$patient_id, 
    type = "patient_id", db_path)

subclonal_snv_ploidy <- subclonal_snv_proportions %>% as.data.frame %>% plyr::join(remixt_ploidy %>% 
    as.data.frame)
subclonal_snv_ploidy_melted <- melt(subclonal_snv_ploidy, id.vars = c("patient_id", 
    "condensed_id", "psi", "Cellularity"), measure.vars = c("snv_subclonal_prop", 
    "snv_subclonal_num"), variable.name = "measure", value.name = "value")

pvals <- ithi.utils::compute_pvals_subsets(subclonal_snv_ploidy_melted, facet_vars = c("measure"), 
    formula = ~psi + value, corfun = cor.test, method = "spearman")

p <- ggplot(subclonal_snv_ploidy_melted, aes(x = psi, y = value)) + geom_point(aes(colour = patient_id)) + 
    theme_bw() + theme_Publication() + theme_nature() + scale_colour_manual(values = annotation_colours$patient_id) + 
    xlab("Ploidy") + ylab("Subclonal SNVs") + facet_wrap(~measure, scales = "free") + 
    geom_text(data = pvals, aes(x = Inf, y = Inf, label = p.value.text), hjust = 1.1, 
        vjust = 1.5, size = 2.5, parse = TRUE)

p

So no, we don’t see a higher proportion/number of subclonal SNVs in samples with higher ploidy. Given the amount of effort taken to incorporate copy number/ploidy in the clonality model, this is not entirely surprising.

Subclonal neoepitope-generating mutations

Next, we’ll address the second bullet point.

subclonal_neoediting_ploidy <- plyr::join(remixt_ploidy, neoediting_res$subclonal_rates %>% 
    plyr::rename(c(sample_key = "condensed_id")), by = c("condensed_id", "patient_id"))
clonal_neoediting_ploidy <- plyr::join(remixt_ploidy, neoediting_res$clonal_rates %>% 
    plyr::rename(c(sample_key = "condensed_id")), by = c("condensed_id", "patient_id"))

subclonal_neoediting_ploidy$epitope_rate <- with(subclonal_neoediting_ploidy, 
    nepitopes/ntotal)
clonal_neoediting_ploidy$epitope_rate <- with(clonal_neoediting_ploidy, nepitopes/ntotal)
subclonal_neoediting_ploidy_melted <- melt(subclonal_neoediting_ploidy, id.vars = c("patient_id", 
    "condensed_id", "psi", "Cellularity"), measure.vars = c("nepitopes", "epitope_rate"), 
    variable.name = "measure", value.name = "value")

pvals <- ithi.utils::compute_pvals_subsets(subclonal_neoediting_ploidy_melted, 
    facet_vars = c("measure"), formula = ~psi + value, corfun = cor.test, method = "spearman")

p1 <- ggplot(subclonal_neoediting_ploidy_melted, aes(x = psi, y = value)) + 
    geom_point(aes(colour = patient_id)) + theme_bw() + theme_Publication() + 
    theme_nature() + scale_colour_manual(values = annotation_colours$patient_id) + 
    xlab("Ploidy") + ylab("Number of subclonal neoepitopes") + facet_wrap(~measure, 
    scales = "free") + geom_text(data = pvals, aes(x = Inf, y = Inf, label = p.value.text), 
    hjust = 1.1, vjust = 1.5, size = 2.5, parse = TRUE)

p1

Thus, higher ploidy is not significantly associated with a higher subclonal neoepitope load or proportion.

clonal_neoediting_ploidy_melted <- melt(clonal_neoediting_ploidy, id.vars = c("patient_id", 
    "condensed_id", "psi", "Cellularity"), measure.vars = c("nepitopes", "epitope_rate"), 
    variable.name = "measure", value.name = "value")

pvals <- ithi.utils::compute_pvals_subsets(clonal_neoediting_ploidy_melted, 
    facet_vars = c("measure"), formula = ~psi + value, corfun = cor.test, method = "spearman")

p2 <- ggplot(clonal_neoediting_ploidy_melted, aes(x = psi, y = value)) + geom_point(aes(colour = patient_id)) + 
    theme_bw() + theme_Publication() + theme_nature() + scale_colour_manual(values = annotation_colours$patient_id) + 
    xlab("Ploidy") + ylab("Number of subclonal neoepitopes") + facet_wrap(~measure, 
    scales = "free") + geom_text(data = pvals, aes(x = Inf, y = Inf, label = p.value.text), 
    hjust = 1.1, vjust = 1.5, size = 2.5, parse = TRUE)

p2

Just for fun, we did this for clonal neoepitopes too – same result.

What about subclonal obs/exp neoepitope rates (i.e. the immunoediting indices)?

corres <- with(subclonal_neoediting_ploidy, cor.test(psi, expratio/obsratio, 
    method = "spearman"))
eq <- substitute(italic(P) == p, list(p = format(corres$p.value, digits = 3)))

p1 <- ggplot(subclonal_neoediting_ploidy, aes(x = psi, y = expratio/obsratio)) + 
    geom_point(aes(colour = patient_id)) + theme_bw() + theme_Publication() + 
    theme_nature() + scale_colour_manual(values = annotation_colours$patient_id) + 
    xlab("Ploidy") + ylab("E_i (subclonal)") + annotate("text", x = Inf, y = Inf, 
    parse = TRUE, hjust = 1, vjust = 1, label = as.character(as.expression(eq)))

p1

corres <- with(clonal_neoediting_ploidy, cor.test(psi, expratio/obsratio, method = "spearman"))
eq <- substitute(italic(P) == p, list(p = format(corres$p.value, digits = 3)))

p2 <- ggplot(clonal_neoediting_ploidy, aes(x = psi, y = expratio/obsratio)) + 
    geom_point(aes(colour = patient_id)) + theme_bw() + theme_Publication() + 
    theme_nature() + scale_colour_manual(values = annotation_colours$patient_id) + 
    xlab("Ploidy") + ylab("E_i (clonal)") + annotate("text", x = Inf, y = Inf, 
    parse = TRUE, hjust = 1, vjust = 1, label = as.character(as.expression(eq)))

p2

Again, no significant correlations. Now, to hammer it home, let’s add ploidy and cellularity into the neoepitope elimination model.

The base model is:

mod <- glmer(expratio/obsratio ~ E_CD8_rescaled + (1 | patient_id), data = subset(subclonal_neoediting_ploidy, 
    !is.na(E_CD8_density)), family = Gamma(link = "log"))
summod <- summary(mod)
summod
Generalized linear mixed model fit by maximum likelihood (Laplace
  Approximation) [glmerMod]
 Family: Gamma  ( log )
Formula: expratio/obsratio ~ E_CD8_rescaled + (1 | patient_id)
   Data: subset(subclonal_neoediting_ploidy, !is.na(E_CD8_density))

     AIC      BIC   logLik deviance df.resid 
   -42.3    -34.7     25.2    -50.3       46 

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-1.7552 -0.5698 -0.1755  0.5849  1.5509 

Random effects:
 Groups     Name        Variance Std.Dev.
 patient_id (Intercept) 0.02454  0.1567  
 Residual               0.01552  0.1246  
Number of obs: 50, groups:  patient_id, 12

Fixed effects:
               Estimate Std. Error t value Pr(>|z|)    
(Intercept)      0.1079     0.1000   1.079    0.281    
E_CD8_rescaled   0.4505     0.1140   3.953 7.72e-05 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Correlation of Fixed Effects:
            (Intr)
E_CD8_rscld -0.198
# pval <- unname(summod$coefficients[,4][2])

Now adding ploidy and cellularity as fixed effects:

mod <- glmer(expratio/obsratio ~ E_CD8_rescaled + psi + Cellularity + (1 | patient_id), 
    data = subset(subclonal_neoediting_ploidy, !is.na(E_CD8_density)), family = Gamma(link = "log"))
summod <- summary(mod)
summod
Generalized linear mixed model fit by maximum likelihood (Laplace
  Approximation) [glmerMod]
 Family: Gamma  ( log )
Formula: expratio/obsratio ~ E_CD8_rescaled + psi + Cellularity + (1 |  
    patient_id)
   Data: subset(subclonal_neoediting_ploidy, !is.na(E_CD8_density))

     AIC      BIC   logLik deviance df.resid 
   -45.3    -33.9     28.7    -57.3       44 

Scaled residuals: 
     Min       1Q   Median       3Q      Max 
-1.79148 -0.58635 -0.07179  0.62726  1.53538 

Random effects:
 Groups     Name        Variance Std.Dev.
 patient_id (Intercept) 0.02424  0.1557  
 Residual               0.01365  0.1168  
Number of obs: 50, groups:  patient_id, 12

Fixed effects:
                Estimate Std. Error t value Pr(>|z|)    
(Intercept)     0.005927   0.147842   0.040 0.968020    
E_CD8_rescaled  0.384110   0.108387   3.544 0.000394 ***
psi            -0.010788   0.032369  -0.333 0.738924    
Cellularity     0.252936   0.093255   2.712 0.006682 ** 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Correlation of Fixed Effects:
            (Intr) E_CD8_ psi   
E_CD8_rscld -0.026              
psi         -0.651 -0.021       
Cellularity -0.377 -0.240  0.074
# pval <- unname(summod$coefficients[,4][2])

The p-value = 3.94305310^{-4} for the epithelial CD8+ TIL density effect remains significant. Ploidy and cellularity are not significant effects. As a sanity check, we make sure that the beta (coefficient) for E_CD8_rescaled remains the same sign in both cases, which it does.

---
title: "Whole genome duplication"
---
                        ```{r, echo=FALSE, message=FALSE, warning=FALSE}

######## Snakemake header ########
library(methods)
Snakemake <- setClass(
    "Snakemake",
    slots = c(
        input = "list",
        output = "list",
        params = "list",
        wildcards = "list",
        threads = "numeric",
        log = "list",
        resources = "list",
        config = "list",
        rule = "character"
    )
)
snakemake <- Snakemake(
    input = list('/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/tree_data.tsv', 'notebooks/whole_genome_duplication.Rmd', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/ihc_table.tsv', '/shahlab/amcpherson/projects/ith3/ith3/notebooks/bespoke/ith_snvs.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_1.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_2.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_3.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_4.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_7.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_9.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_10.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_11.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_12.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_13.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_14.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_15.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_16.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_17.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/branch_data.tsv', '/shahlab/alzhang/pipeline_outputs/ith_immune/neoediting/run6', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/clone_data.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/remixt_cellularity_ploidy.tsv', "clone_tree_file" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/tree_data.tsv', "notebook" = 'notebooks/whole_genome_duplication.Rmd', "ihc_table" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/ihc_table.tsv', "snv_table" = '/shahlab/amcpherson/projects/ith3/ith3/notebooks/bespoke/ith_snvs.tsv', "snv_cluster_files" = c('/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_1.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_2.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_3.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_4.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_7.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_9.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_10.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_11.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_12.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_13.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_14.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_15.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_16.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_17.tsv'), "clone_branch_length_file" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/branch_data.tsv', "clone_prevalence_file" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/clone_data.tsv', "neoediting_outdir" = '/shahlab/alzhang/pipeline_outputs/ith_immune/neoediting/run6', "remixt_cellularity_ploidy_file" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/remixt_cellularity_ploidy.tsv'),
    output = list('/shahlab/alzhang/projects/ITH_Immune/paper/results/review/notebooks/run2/whole_genome_duplication.nb.html'),
    params = list('/shahlab/alzhang/projects/ITH_Immune/metadata/db/immune_project.sqlite3', 'wgd_analysis', "db" = '/shahlab/alzhang/projects/ITH_Immune/metadata/db/immune_project.sqlite3', "name" = 'wgd_analysis'),
    wildcards = list(),
    threads = 1,
    log = list('/shahlab/alzhang/clusttmp/paperreview2/notebooks/wgd_analysis.log'),
    resources = list(),
    config = list("ith_stats" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/ith_statistics.tsv', "refseq_gene_file" = '/shahlab/alzhang/data/genome/hg19/refseq_genes.bed', "xcr_table" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/xcr_table.tsv', "mmctm_final_patient_dir" = '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/ith_by-patient_with-ov', "nanostring_data" = '/shahlab/alzhang/projects/ITH_Immune/results/nanostring_results/ith_full/qc/limma_quantile/normalized_expression_voa_labels_filtered.tsv', "neoediting_outdir" = '/shahlab/alzhang/pipeline_outputs/ith_immune/neoediting/run6', "copynumber_table" = '/shahlab/alzhang/data/ithi/master_copynumber_file.tsv', "bcr_diversity" = '/shahlab/alzhang/pipeline_outputs/ith_immune/mixcr/mixcr_runs/ith_1_2_3/mixcr5/postprocess/IGH/postfilter_diversity_stats/diversity.strict.resampled.txt', "he_results_dir" = '/shahlab/alzhang/data/ithi/finn_results/he_output_Nov29', "array_expression_file" = '/shahlab/alzhang/projects/ITH_Immune/data/expression/array/gene_exprs_rma_batch_corrected.txt', "snv_cluster_dir" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster', "tils_for_cluster" = c('E_CD8_density', 'E_CD4_density', 'E_CD20_density', 'E_Plasma_density', 'S_CD8_density', 'S_CD4_density', 'S_CD20_density', 'S_Plasma_density'), "total_tiltypes" = c('T_CD8_density', 'T_CD4_density', 'T_CD20_density', 'T_Plasma_density'), "logdir" = '/shahlab/alzhang/clusttmp/paperreview2', "ith_icgc_bc" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/ith_icgc_merged_bc.tsv', "patients_for_clonal" = c(1, 2, 3, 4, 7, 9, 10, 11, 12, 13, 14, 15, 16, 17), "variability_type" = 'stabilize', "table_dir" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/review/tables/run2', "db" = '/shahlab/alzhang/projects/ITH_Immune/metadata/db/immune_project.sqlite3', "snv_table" = '/shahlab/amcpherson/projects/ith3/ith3/notebooks/bespoke/ith_snvs.tsv', "epitopes_unique_filtered" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/epitopes_unique_filtered.tsv', "benchmarkdir" = '/shahlab/alzhang/benchmarks/paperreview2', "tilcluster_supervised_ipynb" = '/shahlab/alzhang/projects/ITH_Immune/paper/review/ipy/tilcluster_supervisedmulticlass.ipynb', "image_summary" = '/shahlab/alzhang/data/ithi/yuan_hecr_image_results.csv', "ihc_features_output" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/intermediates/run2/ihc_features_output.txt', "clone_branch_lengths" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/branch_data.tsv', "distance_method" = 'horn', "clone_prevalences" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/clone_data.tsv', "til_clusters_output" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/intermediates/run2/til_clusters_output.txt', "tumour_purity" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/tumour_purity.tsv', "prevalence_threshold" = 0.01, "notebook_dir" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/review/notebooks/run2', "somatic_coding_result_dir" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/somatic_coding_variants', "image_summary2" = '/shahlab/alzhang/data/ithi/yuan_hecr_image_results_2.csv', "known_subtypes_array" = '/shahlab/alzhang/projects/ITH_Immune/data/expression/array/subtypes/known_subtypes.tsv', "ith_stat_types" = c('entropy', 'postprocessed_divergence', 'combined_ith_normalized', 'proportion_subclonal'), "nanostring_annotations" = '/shahlab/alzhang/projects/ITH_Immune/data/expression/nanostring/pancancer_annotations.tsv', "ihc_table" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/ihc_table.tsv', "igpartition_outdir" = '/shahlab/alzhang/pipeline_outputs/ith_immune/igpartition/run22', "all_tiltypes" = c('T_CD8_density', 'T_CD4_density', 'T_CD20_density', 'T_Plasma_density', 'E_CD8_density', 'E_CD4_density', 'E_CD20_density', 'E_Plasma_density', 'S_CD8_density', 'S_CD4_density', 'S_CD20_density', 'S_Plasma_density'), "remixt_cellularity_ploidy" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/remixt_cellularity_ploidy.tsv', "breakpoint_table" = '/shahlab/amcpherson/projects/ith3/ith3/notebooks/bespoke/ith_breakpoints.tsv', "clola_result_file" = '/shahlab/alzhang/pipeline_outputs/ith_immune/clola/run4/clola_condensed_results/beta/clola_results.tsv', "clone_trees" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/tree_data.tsv', "rooney_mutsigcv_file" = '/shahlab/alzhang/projects/ITH_Immune/external/other_papers/mmc6.xlsx', "molsubtypes" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/molsubtypes.tsv', "finnhe_pipeline_results_dir" = '/shahlab/alzhang/pipeline_outputs/ith_immune/finnhe/run1', "icgc_specimen" = '/shahlab/alzhang/data/ICGC/specimen.tsv', "icgc_subtypes" = '/shahlab/alzhang/data/ICGC/icgc_primary_tumour_subtypes.tsv', "tcr_diversity" = '/shahlab/alzhang/pipeline_outputs/ith_immune/mixcr/mixcr_runs/ith_1_2_3/mixcr5/postprocess/TRB/postfilter_diversity_stats/diversity.strict.resampled.txt', "tils_for_variability" = c('T_CD8_density', 'T_CD4_density', 'T_CD20_density', 'T_Plasma_density')),
    rule = 'wgd_analysis'
)
######## Original script #########

                        ```


## Setup

```{r global_chunk_options, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, tidy=TRUE, warning=FALSE, message=FALSE, cache=TRUE) #cache=TRUE
```

```{r}
library(ithi.utils)
load_base_libs()

library(methods)
library(factoextra)

library(ithi.meta)
library(ithi.figures)
library(ithi.utils)
library(ithi.seq)
library(ithi.clones)
library(ithi.supp)
```

```{r}
ihc_table_path <- snakemake@input$ihc_table
neoediting_outdir <- snakemake@input$neoediting_outdir
snv_cluster_files <- snakemake@input$snv_cluster_files
clone_tree_file <- snakemake@input$clone_tree_file
clone_branch_length_file <- snakemake@input$clone_branch_length_file
clone_prevalence_file <- snakemake@input$clone_prevalence_file
remixt_ploidy_file <- snakemake@input$remixt_cellularity_ploidy_file
master_variant_file <- snakemake@input$snv_table

db_path <- snakemake@params$db
```

```{r}
annotation_colours <- ithi.figures::get_annotation_colours()
ihc_table <- fread(ihc_table_path)
master_variant_table <- read_variant_file(master_variant_file, db_path)

tree_branch_data <- read_clone_tree_data(clone_tree_file, clone_branch_length_file, clone_prevalence_file, db_path)

neoediting_res <- supp_neoediting(neoediting_outdir, ihc_table, db_path, tree_branch_data, wtfilter = TRUE, full_epitopes = FALSE, 
                                  snv_cluster_files = snv_cluster_files)
```

## Analysis

We need to answer 2 questions:

* Do whole-genome duplicated (or high ploidy) samples have a higher number/proportion of subclonal mutations?
* Do whole-genome duplicated (or high ploidy) samples have a higher number/proportion of subclonal neoepitope-generating mutations?

### WGD

To examine this, we'll consider ploidy. Under WGD, the ploidy of a tumour sample should be approximately 4. 

```{r}
remixt_ploidy <- read.table(remixt_ploidy_file, row.names = 1, header = TRUE, stringsAsFactors = FALSE)

remixt_ploidy <- remixt_ploidy %>% rownames_to_column(var = "voa")
remixt_ploidy$patient_id <- ithi.meta::factor_id(remixt_ploidy$patient_id, type = "patient_id", db_path)
remixt_ploidy$condensed_id <- ithi.meta::map_id(remixt_ploidy$voa, from = "voa", to = "condensed_id", db_path)
```

#### Subclonal mutations

We'll address the first bullet point here.

Inclusion/exclusion criteria: 

* When calling clonality/subclonality, we exclude mutations that cannot be unambiguously assigned to a clone in the clonal phylogeny (according to maximum likelihood, see STAR Methods). 

```{r}
## This should be refactored into a package
get_annotated_snvs <- function(snv_cluster_files, tree_branch_data) {
  snv_cluster_patients <- as.list(stringr::str_extract(snv_cluster_files, 
                                                       "(?<=patient_)[0-9]+"))
  snv_cluster <- plyr::rbind.fill(lapply(seq_along(snv_cluster_files), 
                                         function(i) {
                                           f <- snv_cluster_files[[i]]
                                           patient_id <- snv_cluster_patients[[i]]
                                           snv_cluster <- ithi.clones::read_snv_cluster(f, clone_branch_length_file, 
                                                                                        db_path)
                                           return(snv_cluster)
                                         }))
  trees <- lapply(tree_branch_data, function(x) x$tree)
  branch_lengths <- plyr::rbind.fill(lapply(tree_branch_data, 
                                            function(x) x$branch_dat))
  snv_cluster <- plyr::join(snv_cluster, branch_lengths)
  patients <- unique(unlist(snv_cluster_patients))
  root_data <- plyr::rbind.fill(lapply(patients, function(pat) {
    tree <- trees[[as.character(pat)]]
    ancestors <- ithi.supp:::find_ancestors(tree)
    plyr::rbind.fill(lapply(ancestors, function(x) {
      data.frame(label = x, patient_id = pat)
    }))
  }))
  root_data$node_type <- "root"
  root_data$patient_id <- as.numeric(as.character(root_data$patient_id))
  snv_cluster_annotated <- plyr::join(snv_cluster, root_data, 
                                      type = "left") %>% plyr::rename(c(chrom = "Chromosome", 
                                                                        ref = "Reference", alt = "Variant", coord = "Start"))
  return(snv_cluster_annotated)
}
```

```{r}
snvs_annotated <- get_annotated_snvs(snv_cluster_files, tree_branch_data)
```

```{r}
snvs_annotated_filtered <- subset(snvs_annotated, !is.na(node))
snvs_annotated_filtered$node_type[is.na(snvs_annotated_filtered$node_type)] <- "subclonal"
```

```{r}
master_variant_present <- subset(master_variant_table, is_present == 1)

snvs_annotated_filtered_renamed <- snvs_annotated_filtered %>% plyr::rename(c('Chromosome'='chrom', 'Start'='coord', 'Reference'='ref', 'Variant'='alt'))

master_variant_annotated <- merge(master_variant_present, snvs_annotated_filtered_renamed, by = c("patient_id", "chrom", "coord", "ref", "alt"))
```

Ok, now we can do the counting. 

```{r}
node_type_snv_count <- master_variant_annotated %>% group_by(patient_id, sample_key, condensed_id, node_type) %>% summarise(num_snv=n())

subclonal_snv_proportions <- node_type_snv_count %>% group_by(patient_id, sample_key, condensed_id) %>% summarise(snv_subclonal_prop=num_snv[node_type == "subclonal"]/sum(num_snv), snv_subclonal_num = num_snv[node_type == "subclonal"])
```

```{r}
subclonal_snv_proportions$patient_id <- ithi.meta::factor_id(subclonal_snv_proportions$patient_id, type = "patient_id", db_path)

subclonal_snv_ploidy <- subclonal_snv_proportions %>% as.data.frame %>% plyr::join(remixt_ploidy %>% as.data.frame)
```

```{r}
subclonal_snv_ploidy_melted <- melt(subclonal_snv_ploidy, id.vars = c("patient_id", "condensed_id", "psi", "Cellularity"), measure.vars = c("snv_subclonal_prop", "snv_subclonal_num"), variable.name = "measure", value.name = "value")

pvals <- ithi.utils::compute_pvals_subsets(subclonal_snv_ploidy_melted, facet_vars = c("measure"), formula = ~ psi + value, corfun = cor.test, method = "spearman")

p <- ggplot(subclonal_snv_ploidy_melted, aes(x=psi, y=value)) + geom_point(aes(colour=patient_id)) + theme_bw() + theme_Publication() + 
  theme_nature() + scale_colour_manual(values = annotation_colours$patient_id) + xlab("Ploidy") + ylab("Subclonal SNVs") + 
  facet_wrap(~ measure, scales = "free") + geom_text(data=pvals, aes(x=Inf, y=Inf, label=p.value.text), hjust=1.1, vjust=1.5,size=2.5,parse=TRUE)

p
```

So no, we don't see a higher proportion/number of subclonal SNVs in samples with higher ploidy. Given the amount of effort taken to incorporate copy number/ploidy in the clonality model, this is not entirely surprising. 

#### Subclonal neoepitope-generating mutations

Next, we'll address the second bullet point. 

```{r}
subclonal_neoediting_ploidy <- plyr::join(remixt_ploidy, neoediting_res$subclonal_rates %>% plyr::rename(c('sample_key'='condensed_id')),
                                          by = c("condensed_id", "patient_id"))
clonal_neoediting_ploidy <- plyr::join(remixt_ploidy, neoediting_res$clonal_rates %>% plyr::rename(c('sample_key'='condensed_id')),
                                          by = c("condensed_id", "patient_id"))

subclonal_neoediting_ploidy$epitope_rate <- with(subclonal_neoediting_ploidy, nepitopes/ntotal)
clonal_neoediting_ploidy$epitope_rate <- with(clonal_neoediting_ploidy, nepitopes/ntotal)
```

```{r}
subclonal_neoediting_ploidy_melted <- melt(subclonal_neoediting_ploidy, id.vars = c("patient_id", "condensed_id", "psi", "Cellularity"),
                                           measure.vars = c("nepitopes", "epitope_rate"), variable.name = "measure", value.name = "value")

pvals <- ithi.utils::compute_pvals_subsets(subclonal_neoediting_ploidy_melted, facet_vars = c("measure"), formula = ~ psi + value, corfun = cor.test, method = "spearman")

p1 <- ggplot(subclonal_neoediting_ploidy_melted, aes(x=psi, y=value)) + geom_point(aes(colour=patient_id)) + theme_bw() + theme_Publication() + 
  theme_nature() + scale_colour_manual(values = annotation_colours$patient_id) + xlab("Ploidy") + ylab("Number of subclonal neoepitopes") +
  facet_wrap(~ measure, scales = "free") + geom_text(data=pvals, aes(x=Inf, y=Inf, label=p.value.text), hjust=1.1, vjust=1.5,size=2.5,parse=TRUE)

p1 
```

Thus, higher ploidy is not significantly associated with a higher subclonal neoepitope load or proportion. 

```{r}
clonal_neoediting_ploidy_melted <- melt(clonal_neoediting_ploidy, id.vars = c("patient_id", "condensed_id", "psi", "Cellularity"),
                                        measure.vars = c("nepitopes", "epitope_rate"), variable.name = "measure", value.name = "value")

pvals <- ithi.utils::compute_pvals_subsets(clonal_neoediting_ploidy_melted, facet_vars = c("measure"), formula = ~ psi + value, corfun = cor.test, method = "spearman")

p2 <- ggplot(clonal_neoediting_ploidy_melted, aes(x=psi, y=value)) + geom_point(aes(colour=patient_id)) + theme_bw() + theme_Publication() + 
  theme_nature() + scale_colour_manual(values = annotation_colours$patient_id) + xlab("Ploidy") + ylab("Number of subclonal neoepitopes") +
  facet_wrap(~ measure, scales = "free") + geom_text(data=pvals, aes(x=Inf, y=Inf, label=p.value.text), hjust=1.1, vjust=1.5,size=2.5,parse=TRUE)

p2
```

Just for fun, we did this for clonal neoepitopes too -- same result. 

What about subclonal obs/exp neoepitope rates (i.e. the immunoediting indices)? 

```{r}
corres <- with(subclonal_neoediting_ploidy, cor.test(psi, expratio/obsratio, method = "spearman"))
eq <- substitute(italic(P)==p, list(p=format(corres$p.value, digits = 3)))

p1 <- ggplot(subclonal_neoediting_ploidy, aes(x=psi, y=expratio/obsratio)) + geom_point(aes(colour=patient_id)) + theme_bw() + theme_Publication() + 
  theme_nature() + scale_colour_manual(values = annotation_colours$patient_id) + xlab("Ploidy") + ylab("E_i (subclonal)") +
  annotate('text', x = Inf, y = Inf, parse = TRUE, hjust = 1, vjust = 1, label = as.character(as.expression(eq)))

p1 
```

```{r}
corres <- with(clonal_neoediting_ploidy, cor.test(psi, expratio/obsratio, method = "spearman"))
eq <- substitute(italic(P)==p, list(p=format(corres$p.value, digits = 3)))

p2 <- ggplot(clonal_neoediting_ploidy, aes(x=psi, y=expratio/obsratio)) + geom_point(aes(colour=patient_id)) + theme_bw() + theme_Publication() + 
  theme_nature() + scale_colour_manual(values = annotation_colours$patient_id) + xlab("Ploidy") + ylab("E_i (clonal)") +
  annotate('text', x = Inf, y = Inf, parse = TRUE, hjust = 1, vjust = 1, label = as.character(as.expression(eq)))

p2 
```

Again, no significant correlations. Now, to hammer it home, let's add ploidy and cellularity into the neoepitope elimination model. 

The base model is:

```{r}
mod <- glmer(expratio/obsratio ~ E_CD8_rescaled + (1 | patient_id), data=subset(subclonal_neoediting_ploidy, !is.na(E_CD8_density)), family = Gamma(link="log"))
summod <- summary(mod)
summod
#pval <- unname(summod$coefficients[,4][2])
```

Now adding ploidy and cellularity as fixed effects:

```{r}
mod <- glmer(expratio/obsratio ~ E_CD8_rescaled + psi + Cellularity + (1 | patient_id), data=subset(subclonal_neoediting_ploidy, !is.na(E_CD8_density)), family = Gamma(link="log"))
summod <- summary(mod)
summod
#pval <- unname(summod$coefficients[,4][2])
```


The p-value = `r unname(summod$coefficients[,4][2])` for the epithelial CD8+ TIL density effect remains significant. Ploidy and cellularity are not significant effects. As a sanity check, we make sure that the beta (coefficient) for E_CD8_rescaled remains the same sign in both cases, which it does. 

