Setup

library(ithi.utils)
load_base_libs()

library(methods)

library(ithi.meta)
library(ithi.figures)
library(ithi.utils)
library(ithi.seq)
library(ithi.clones)
library(ithi.supp)
library(ithi.xcr)
ihc_table_path <- snakemake@input$ihc_table
xcr_table_path <- snakemake@input$xcr_table
neoediting_outdir <- snakemake@input$neoediting_outdir
snv_cluster_files <- snakemake@input$snv_cluster_files
clone_tree_file <- snakemake@input$clone_tree_file
clone_branch_length_file <- snakemake@input$clone_branch_length_file
clone_prevalence_file <- snakemake@input$clone_prevalence_file
tcr_diversity_file <- snakemake@input$tcr_diversity
bcr_diversity_file <- snakemake@input$bcr_diversity
remixt_ploidy_file <- snakemake@input$remixt_cellularity_ploidy_file
clonal_measures_file <- snakemake@input$ith_stats

db_path <- snakemake@params$db
ith_stat_types <- snakemake@params$ith_stat_types

xcr_diversity_measures <- as.vector(outer(c("tcr", "bcr"), c("clonotypes_unique", 
    "shannon_entropy", "inverse_simpson", "D50_index", "chao1"), function(x, 
    y) paste(x, y, sep = "_")))
annotation_colours <- ithi.figures::get_annotation_colours()

ihc_table <- fread(ihc_table_path)
xcr_table <- read_clonotypes(xcr_table_path, duplicates = FALSE, db_path = db_path)

Read 19.7% of 304822 rows
Read 62.3% of 304822 rows
Read 88.6% of 304822 rows
Read 304822 rows and 18 (of 18) columns from 0.070 GB file in 00:00:05
tree_branch_data <- read_clone_tree_data(clone_tree_file, clone_branch_length_file, 
    clone_prevalence_file, db_path)

neoediting_res <- supp_neoediting(neoediting_outdir, ihc_table, db_path, tree_branch_data, 
    wtfilter = TRUE, full_epitopes = FALSE, snv_cluster_files = snv_cluster_files)

xcr_diversity <- ithi.supp::get_xcr_diversity(tcr_diversity_file, bcr_diversity_file, 
    db_path, xcr_table)

remixt_ploidy <- read.table(remixt_ploidy_file, row.names = 1, header = TRUE, 
    stringsAsFactors = FALSE)

remixt_ploidy <- remixt_ploidy %>% rownames_to_column(var = "voa")
remixt_ploidy$patient_id <- ithi.meta::factor_id(remixt_ploidy$patient_id, type = "patient_id", 
    db_path)
remixt_ploidy$condensed_id <- ithi.meta::map_id(remixt_ploidy$voa, from = "voa", 
    to = "condensed_id", db_path)

clonal_measures <- read_ith_stats(clonal_measures_file, db_path, duplicates = FALSE)

Analysis

The purpose of this analysis is to show whether or not TCR repertoire diversity associates with subclonal neoepitope depletion. If so, I suppose the biological interpretation would be that a more diverse TCR repertoire would be better able to eliminate specific (subclonal) antigens.

There are 2 things we should use to address this line of thinking:

  • Show that TCR/BCR repertoire diversity is NOT associated with ITH
  • Show whether or not TCR repertoire diversity has anything to do with subclonal neoepitope depletion

The first point – at a high level – challenges the assumption that more diverse TCR repertoires may contend with antigenic diversity associated with subclonal diversification, while the latter point directly addresses the reviewer comment. To then explain how TCR diversity can be collinear with epithelial CD8+ TIL density but not be associated with ITH, I need to review the results of the TIL cluster vs. XCR diversity analysis. (TODO!)

XCR diversity-ITH association

We mention this in the paper, but to show a plot:

xcr_diversity_melted <- melt(xcr_diversity, id.vars = c("patient_id", "condensed_id"), 
    measure.vars = xcr_diversity_measures)

xcr_clonal_diversity <- merge(clonal_measures, xcr_diversity_melted)
for (stat_type in ith_stat_types) {
    pvals <- ithi.utils::compute_pvals_subsets(xcr_clonal_diversity, facet_vars = c("variable"), 
        formula = as.formula(paste0("~ ", stat_type, "+ value")), corfun = cor.test, 
        method = "spearman")
    
    p <- ggplot(xcr_clonal_diversity, aes_string(x = stat_type, y = "value")) + 
        geom_point(aes(colour = patient_id)) + theme_bw() + theme_Publication() + 
        theme_nature() + scale_colour_manual(values = annotation_colours$patient_id) + 
        xlab(stat_type) + ylab("XCR diversity") + facet_wrap(~variable, scales = "free") + 
        geom_text(data = pvals, aes(x = Inf, y = Inf, label = p.value.text), 
            hjust = 1.1, vjust = 1.5, size = 2.5, parse = TRUE)
    
    print(p)
}

Even when working with uncorrected p-values, it’s quite obvious that there’s no correlation between ITH and XCR diversity.

Subclonal neoepitope depletion

rates_xcr_analysis <- function(rates_unmerged, xcr_diversity, divcol = "tcr_clonotypes_unique") {
    rates_unmerged_xcr <- plyr::join(rates_unmerged, xcr_diversity)
    rates_unmerged_xcr$patient_id <- ithi.meta::map_id(rates_unmerged_xcr$condensed_id, 
        from = "condensed_id", to = "patient_id", db_path)
    div_cols <- c("tcr_clonotypes_unique", "tcr_shannon_entropy", "tcr_inverse_simpson", 
        "bcr_clonotypes_unique", "bcr_shannon_entropy", "bcr_inverse_simpson")
    
    for (col in div_cols) {
        rates_unmerged_xcr[, col] <- rates_unmerged_xcr[, col]/max(rates_unmerged_xcr[, 
            col], na.rm = TRUE)
    }
    
    mod <- glmer(as.formula(paste0("expratio/obsratio ~ ", divcol, "+ (1 | patient_id)")), 
        data = rates_unmerged_xcr[!is.na(rates_unmerged_xcr[, divcol]), ], family = Gamma(link = "log"))
    summod <- summary(mod)
    pval <- unname(summod$coefficients[, 4][2])
    
    rates_unmerged_xcr$ei <- with(rates_unmerged_xcr, expratio/obsratio)
    
    labeller_vector <- c(paste("Patient", unique(rates_unmerged_xcr$patient_id)))
    names(labeller_vector) <- as.character(unique(rates_unmerged_xcr$patient_id))
    
    p1 <- ggplot(rates_unmerged_xcr, aes_string(x = divcol, y = "ei")) + theme_bw() + 
        ithi.utils::theme_Publication() + ithi.utils::theme_nature() + ithi.utils::stripped_theme() + 
        ylab("Expected/observed neoantigen ratio") + geom_point() + scale_color_manual(values = annotation_colours$patient_id) + 
        facet_wrap(~patient_id, scales = "free", labeller = as_labeller(labeller_vector)) + 
        xlab(divcol) + guides(colour = guide_legend(title = "Patient", nrow = 2)) + 
        ithi.utils::ggmargins(type = "topminus")
    
    return(list(plot = p1, pval = pval))
}
subclonal_rates_unmerged <- neoediting_res$subclonal_rates_unmerged %>% plyr::rename(c(sample_key = "condensed_id"))

tcr_uniqclone_neoediting_subclonal_res <- rates_xcr_analysis(subclonal_rates_unmerged, 
    xcr_diversity, divcol = "tcr_clonotypes_unique")
bcr_uniqclone_neoediting_subclonal_res <- rates_xcr_analysis(subclonal_rates_unmerged, 
    xcr_diversity, divcol = "bcr_clonotypes_unique")

tcr_shannon_neoediting_subclonal_res <- rates_xcr_analysis(subclonal_rates_unmerged, 
    xcr_diversity, divcol = "tcr_shannon_entropy")
bcr_shannon_neoediting_subclonal_res <- rates_xcr_analysis(subclonal_rates_unmerged, 
    xcr_diversity, divcol = "bcr_shannon_entropy")

tcr_simpson_neoediting_subclonal_res <- rates_xcr_analysis(subclonal_rates_unmerged, 
    xcr_diversity, divcol = "tcr_inverse_simpson")
bcr_simpson_neoediting_subclonal_res <- rates_xcr_analysis(subclonal_rates_unmerged, 
    xcr_diversity, divcol = "bcr_inverse_simpson")

(These are for subclonal neoediting.)

clonal_rates_unmerged <- neoediting_res$clonal_rates_unmerged %>% plyr::rename(c(sample_key = "condensed_id"))

tcr_uniqclone_neoediting_clonal_res <- rates_xcr_analysis(clonal_rates_unmerged, 
    xcr_diversity, divcol = "tcr_clonotypes_unique")
bcr_uniqclone_neoediting_clonal_res <- rates_xcr_analysis(clonal_rates_unmerged, 
    xcr_diversity, divcol = "bcr_clonotypes_unique")

tcr_shannon_neoediting_clonal_res <- rates_xcr_analysis(clonal_rates_unmerged, 
    xcr_diversity, divcol = "tcr_shannon_entropy")
bcr_shannon_neoediting_clonal_res <- rates_xcr_analysis(clonal_rates_unmerged, 
    xcr_diversity, divcol = "bcr_shannon_entropy")

tcr_simpson_neoediting_clonal_res <- rates_xcr_analysis(clonal_rates_unmerged, 
    xcr_diversity, divcol = "tcr_inverse_simpson")
bcr_simpson_neoediting_clonal_res <- rates_xcr_analysis(clonal_rates_unmerged, 
    xcr_diversity, divcol = "bcr_inverse_simpson")

So, this is a bit interesting. While we don’t really see any robust results for clonal neoediting and TCR/BCR diversity, we do see that TCR, but not BCR diversity is (negatively) correlated with E_i. The plots aren’t super convincing:

tcr_shannon_neoediting_subclonal_res$plot

But this implies that perhaps less diverse – more expanded? – TCR populations are also a sign of immunoediting.

A natural question to ask is whether or not this is merely because TCR diversity and CD8+ TIL densities may be negatively correlated within patients. We can see whether that’s the case:

ihc_table$patient_id <- ithi.meta::factor_id(ihc_table$patient_id, type = "patient_id", 
    db_path)
ihc_xcr_div <- ihc_table %>% plyr::join(xcr_diversity)
ihc_xcr_div$E_CD8_rescaled <- ihc_xcr_div$E_CD8_density/max(ihc_xcr_div$E_CD8_density, 
    na.rm = TRUE)
ihc_xcr_div$tcr_shannon_entropy_rescaled <- ihc_xcr_div$tcr_shannon_entropy/max(ihc_xcr_div$tcr_shannon_entropy, 
    na.rm = TRUE)

dat <- subset(ihc_xcr_div, !is.na(E_CD8_density) & !is.na(tcr_shannon_entropy))
singleton_patients <- (dat %>% group_by(patient_id) %>% summarise(n = n()) %>% 
    subset(n == 1))$patient_id

glmer(as.formula(paste0("E_CD8_rescaled ~ ", "tcr_shannon_entropy_rescaled", 
    "+ (1 | patient_id)")), data = dat %>% subset(!patient_id %in% singleton_patients), 
    family = Gamma(link = "log")) %>% summary
Generalized linear mixed model fit by maximum likelihood (Laplace
  Approximation) [glmerMod]
 Family: Gamma  ( log )
Formula: E_CD8_rescaled ~ tcr_shannon_entropy_rescaled + (1 | patient_id)
   Data: dat %>% subset(!patient_id %in% singleton_patients)

     AIC      BIC   logLik deviance df.resid 
  -178.5   -169.5     93.3   -186.5       67 

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-1.2181 -0.6581 -0.1140  0.4331  2.7443 

Random effects:
 Groups     Name        Variance Std.Dev.
 patient_id (Intercept) 1.1756   1.084   
 Residual               0.6022   0.776   
Number of obs: 71, groups:  patient_id, 17

Fixed effects:
                             Estimate Std. Error t value Pr(>|z|)    
(Intercept)                   -2.5896     0.3663  -7.070 1.55e-12 ***
tcr_shannon_entropy_rescaled   0.4004     0.6402   0.626    0.532    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Correlation of Fixed Effects:
            (Intr)
tcr_shnnn__ -0.384

And there’s no correlation here. So it’s not true that, within patients, samples with higher epithelial CD8+ TIL densities necessarily have lower TCR diversity.

subclonal_rates_ihc_xcr_div <- plyr::join(subclonal_rates_unmerged, ihc_xcr_div)

mod <- glmer(expratio/obsratio ~ E_CD8_rescaled + tcr_shannon_entropy_rescaled + 
    (1 | patient_id), data = subset(subclonal_rates_ihc_xcr_div, !is.na(E_CD8_density) & 
    !is.na(tcr_shannon_entropy_rescaled)), family = Gamma(link = "log"))
summod <- summary(mod)
summod
Generalized linear mixed model fit by maximum likelihood (Laplace
  Approximation) [glmerMod]
 Family: Gamma  ( log )
Formula: 
expratio/obsratio ~ E_CD8_rescaled + tcr_shannon_entropy_rescaled +  
    (1 | patient_id)
   Data: 
subset(subclonal_rates_ihc_xcr_div, !is.na(E_CD8_density) & !is.na(tcr_shannon_entropy_rescaled))

     AIC      BIC   logLik deviance df.resid 
   -32.4    -23.5     21.2    -42.4       39 

Scaled residuals: 
     Min       1Q   Median       3Q      Max 
-1.88529 -0.53689 -0.09274  0.51294  1.35377 

Random effects:
 Groups     Name        Variance Std.Dev.
 patient_id (Intercept) 0.02579  0.1606  
 Residual               0.01519  0.1233  
Number of obs: 44, groups:  patient_id, 12

Fixed effects:
                             Estimate Std. Error t value Pr(>|z|)    
(Intercept)                    0.1487     0.1039   1.431 0.152303    
E_CD8_rescaled                 0.4192     0.1112   3.769 0.000164 ***
tcr_shannon_entropy_rescaled  -0.2673     0.1446  -1.849 0.064454 .  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Correlation of Fixed Effects:
            (Intr) E_CD8_
E_CD8_rscld -0.215       
tcr_shnnn__ -0.276  0.160

Note how the sign (of the TCR coefficient) changes.

Adding in ploidy …

subclonal_rates_ihc_xcr_div_ploidy <- plyr::join(subclonal_rates_ihc_xcr_div, 
    remixt_ploidy, by = c("condensed_id", "patient_id"))

mod <- glmer(expratio/obsratio ~ E_CD8_rescaled + tcr_shannon_entropy_rescaled + 
    psi + Cellularity + (1 | patient_id), data = subset(subclonal_rates_ihc_xcr_div_ploidy, 
    !is.na(E_CD8_density) & !is.na(tcr_shannon_entropy_rescaled)), family = Gamma(link = "log"))
summod <- summary(mod)
summod
Generalized linear mixed model fit by maximum likelihood (Laplace
  Approximation) [glmerMod]
 Family: Gamma  ( log )
Formula: 
expratio/obsratio ~ E_CD8_rescaled + tcr_shannon_entropy_rescaled +  
    psi + Cellularity + (1 | patient_id)
   Data: 
subset(subclonal_rates_ihc_xcr_div_ploidy, !is.na(E_CD8_density) &  
    !is.na(tcr_shannon_entropy_rescaled))

     AIC      BIC   logLik deviance df.resid 
   -51.7    -38.8     32.9    -65.7       40 

Scaled residuals: 
     Min       1Q   Median       3Q      Max 
-2.03133 -0.40851 -0.02773  0.52020  1.35785 

Random effects:
 Groups     Name        Variance Std.Dev.
 patient_id (Intercept) 0.02473  0.1573  
 Residual               0.01090  0.1044  
Number of obs: 47, groups:  patient_id, 12

Fixed effects:
                             Estimate Std. Error t value Pr(>|z|)    
(Intercept)                  -0.03317    0.14038  -0.236 0.813223    
E_CD8_rescaled                0.32928    0.09320   3.533 0.000411 ***
tcr_shannon_entropy_rescaled -0.13896    0.12053  -1.153 0.248942    
psi                          -0.01147    0.02750  -0.417 0.676676    
Cellularity                   0.37558    0.08830   4.253  2.1e-05 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Correlation of Fixed Effects:
            (Intr) E_CD8_ tcr___ psi   
E_CD8_rscld -0.019                     
tcr_shnnn__ -0.152  0.104              
psi         -0.539 -0.034 -0.171       
Cellularity -0.380 -0.244  0.221  0.020

So, putting everything together we still have epithelial CD8+ TIL density being significant, TCR diversity being a negatively associated factor at the edge of significance, while controlling for ploidy and cellularity.

In summary, TCR entropy alone isn’t significantly correlated with subclonal neoepitope elimination. When you add epithelial CD8+ TIL density to the explanatory variables, all of a sudden it is – but likely because it’s collinear with E_CD8_rescaled, given that the sign of the coefficient changes. Regularization could help here but no idea how to do it in glmer.

Some extensions might be to use other diversity indices (e.g. D50 or Berger-Parker indices).

---
title: "TCR-neoepitope correlation"
---
                        ```{r, echo=FALSE, message=FALSE, warning=FALSE}

######## Snakemake header ########
library(methods)
Snakemake <- setClass(
    "Snakemake",
    slots = c(
        input = "list",
        output = "list",
        params = "list",
        wildcards = "list",
        threads = "numeric",
        log = "list",
        resources = "list",
        config = "list",
        rule = "character"
    )
)
snakemake <- Snakemake(
    input = list('/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/ith_statistics.tsv', 'notebooks/tcr_neoepitope_correlation.Rmd', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/tree_data.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/clone_data.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_1.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_2.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_3.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_4.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_7.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_9.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_10.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_11.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_12.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_13.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_14.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_15.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_16.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_17.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/remixt_cellularity_ploidy.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/ihc_table.tsv', '/shahlab/alzhang/pipeline_outputs/ith_immune/neoediting/run6', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/xcr_table.tsv', '/shahlab/alzhang/pipeline_outputs/ith_immune/mixcr/mixcr_runs/ith_1_2_3/mixcr5/postprocess/IGH/postfilter_diversity_stats/diversity.strict.resampled.txt', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/branch_data.tsv', '/shahlab/alzhang/pipeline_outputs/ith_immune/mixcr/mixcr_runs/ith_1_2_3/mixcr5/postprocess/TRB/postfilter_diversity_stats/diversity.strict.resampled.txt', "ith_stats" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/ith_statistics.tsv', "notebook" = 'notebooks/tcr_neoepitope_correlation.Rmd', "clone_tree_file" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/tree_data.tsv', "clone_prevalence_file" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/clone_data.tsv', "snv_cluster_files" = c('/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_1.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_2.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_3.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_4.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_7.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_9.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_10.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_11.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_12.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_13.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_14.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_15.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_16.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster/patient_17.tsv'), "remixt_cellularity_ploidy_file" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/remixt_cellularity_ploidy.tsv', "ihc_table" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/ihc_table.tsv', "neoediting_outdir" = '/shahlab/alzhang/pipeline_outputs/ith_immune/neoediting/run6', "xcr_table" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/xcr_table.tsv', "bcr_diversity" = '/shahlab/alzhang/pipeline_outputs/ith_immune/mixcr/mixcr_runs/ith_1_2_3/mixcr5/postprocess/IGH/postfilter_diversity_stats/diversity.strict.resampled.txt', "clone_branch_length_file" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/branch_data.tsv', "tcr_diversity" = '/shahlab/alzhang/pipeline_outputs/ith_immune/mixcr/mixcr_runs/ith_1_2_3/mixcr5/postprocess/TRB/postfilter_diversity_stats/diversity.strict.resampled.txt'),
    output = list('/shahlab/alzhang/projects/ITH_Immune/paper/results/review/notebooks/run2/tcr_neoepitope_correlation.nb.html'),
    params = list('/shahlab/alzhang/projects/ITH_Immune/metadata/db/immune_project.sqlite3', 'tcr_neoepitope_correlation_analysis', "db" = '/shahlab/alzhang/projects/ITH_Immune/metadata/db/immune_project.sqlite3', "name" = 'tcr_neoepitope_correlation_analysis'),
    wildcards = list(),
    threads = 1,
    log = list('/shahlab/alzhang/clusttmp/paperreview2/notebooks/tcr_neoepitope_correlation_analysis.log'),
    resources = list(),
    config = list("nanostring_annotations" = '/shahlab/alzhang/projects/ITH_Immune/data/expression/nanostring/pancancer_annotations.tsv', "clola_result_file" = '/shahlab/alzhang/pipeline_outputs/ith_immune/clola/run4/clola_condensed_results/beta/clola_results.tsv', "clone_prevalences" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/clone_data.tsv', "somatic_coding_result_dir" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/somatic_coding_variants', "refseq_gene_file" = '/shahlab/alzhang/data/genome/hg19/refseq_genes.bed', "tilcluster_supervised_ipynb" = '/shahlab/alzhang/projects/ITH_Immune/paper/review/ipy/tilcluster_supervisedmulticlass.ipynb', "icgc_specimen" = '/shahlab/alzhang/data/ICGC/specimen.tsv', "nanostring_data" = '/shahlab/alzhang/projects/ITH_Immune/results/nanostring_results/ith_full/qc/limma_quantile/normalized_expression_voa_labels_filtered.tsv', "ith_icgc_bc" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/ith_icgc_merged_bc.tsv', "ihc_table" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/ihc_table.tsv', "neoediting_outdir" = '/shahlab/alzhang/pipeline_outputs/ith_immune/neoediting/run6', "tumour_purity" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/tumour_purity.tsv', "icgc_subtypes" = '/shahlab/alzhang/data/ICGC/icgc_primary_tumour_subtypes.tsv', "ith_stats" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/ith_statistics.tsv', "rooney_mutsigcv_file" = '/shahlab/alzhang/projects/ITH_Immune/external/other_papers/mmc6.xlsx', "snv_table" = '/shahlab/amcpherson/projects/ith3/ith3/notebooks/bespoke/ith_snvs.tsv', "notebook_dir" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/review/notebooks/run2', "variability_type" = 'stabilize', "igpartition_outdir" = '/shahlab/alzhang/pipeline_outputs/ith_immune/igpartition/run22', "table_dir" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/review/tables/run2', "patients_for_clonal" = c(1, 2, 3, 4, 7, 9, 10, 11, 12, 13, 14, 15, 16, 17), "ith_stat_types" = c('entropy', 'postprocessed_divergence', 'combined_ith_normalized', 'proportion_subclonal'), "finnhe_pipeline_results_dir" = '/shahlab/alzhang/pipeline_outputs/ith_immune/finnhe/run1', "known_subtypes_array" = '/shahlab/alzhang/projects/ITH_Immune/data/expression/array/subtypes/known_subtypes.tsv', "til_clusters_output" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/intermediates/run2/til_clusters_output.txt', "prevalence_threshold" = 0.01, "breakpoint_table" = '/shahlab/amcpherson/projects/ith3/ith3/notebooks/bespoke/ith_breakpoints.tsv', "epitopes_unique_filtered" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/epitopes_unique_filtered.tsv', "db" = '/shahlab/alzhang/projects/ITH_Immune/metadata/db/immune_project.sqlite3', "distance_method" = 'horn', "remixt_cellularity_ploidy" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/remixt_cellularity_ploidy.tsv', "logdir" = '/shahlab/alzhang/clusttmp/paperreview2', "image_summary" = '/shahlab/alzhang/data/ithi/yuan_hecr_image_results.csv', "bcr_diversity" = '/shahlab/alzhang/pipeline_outputs/ith_immune/mixcr/mixcr_runs/ith_1_2_3/mixcr5/postprocess/IGH/postfilter_diversity_stats/diversity.strict.resampled.txt', "total_tiltypes" = c('T_CD8_density', 'T_CD4_density', 'T_CD20_density', 'T_Plasma_density'), "tcr_diversity" = '/shahlab/alzhang/pipeline_outputs/ith_immune/mixcr/mixcr_runs/ith_1_2_3/mixcr5/postprocess/TRB/postfilter_diversity_stats/diversity.strict.resampled.txt', "copynumber_table" = '/shahlab/alzhang/data/ithi/master_copynumber_file.tsv', "snv_cluster_dir" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster', "tils_for_variability" = c('T_CD8_density', 'T_CD4_density', 'T_CD20_density', 'T_Plasma_density'), "all_tiltypes" = c('T_CD8_density', 'T_CD4_density', 'T_CD20_density', 'T_Plasma_density', 'E_CD8_density', 'E_CD4_density', 'E_CD20_density', 'E_Plasma_density', 'S_CD8_density', 'S_CD4_density', 'S_CD20_density', 'S_Plasma_density'), "mmctm_final_patient_dir" = '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/ith_by-patient_with-ov', "tils_for_cluster" = c('E_CD8_density', 'E_CD4_density', 'E_CD20_density', 'E_Plasma_density', 'S_CD8_density', 'S_CD4_density', 'S_CD20_density', 'S_Plasma_density'), "array_expression_file" = '/shahlab/alzhang/projects/ITH_Immune/data/expression/array/gene_exprs_rma_batch_corrected.txt', "molsubtypes" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/molsubtypes.tsv', "ihc_features_output" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/intermediates/run2/ihc_features_output.txt', "clone_branch_lengths" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/branch_data.tsv', "xcr_table" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/xcr_table.tsv', "he_results_dir" = '/shahlab/alzhang/data/ithi/finn_results/he_output_Nov29', "clone_trees" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/tree_data.tsv', "benchmarkdir" = '/shahlab/alzhang/benchmarks/paperreview2', "image_summary2" = '/shahlab/alzhang/data/ithi/yuan_hecr_image_results_2.csv'),
    rule = 'tcr_neoepitope_correlation_analysis'
)
######## Original script #########

                        ```


## Setup

```{r global_chunk_options, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, tidy=TRUE, warning=FALSE, message=FALSE, cache=TRUE) #cache=TRUE
```

```{r}
library(ithi.utils)
load_base_libs()

library(methods)

library(ithi.meta)
library(ithi.figures)
library(ithi.utils)
library(ithi.seq)
library(ithi.clones)
library(ithi.supp)
library(ithi.xcr)
```

```{r}
ihc_table_path <- snakemake@input$ihc_table
xcr_table_path <- snakemake@input$xcr_table
neoediting_outdir <- snakemake@input$neoediting_outdir
snv_cluster_files <- snakemake@input$snv_cluster_files
clone_tree_file <- snakemake@input$clone_tree_file
clone_branch_length_file <- snakemake@input$clone_branch_length_file
clone_prevalence_file <- snakemake@input$clone_prevalence_file
tcr_diversity_file <- snakemake@input$tcr_diversity
bcr_diversity_file <- snakemake@input$bcr_diversity
remixt_ploidy_file <- snakemake@input$remixt_cellularity_ploidy_file
clonal_measures_file <- snakemake@input$ith_stats

db_path <- snakemake@params$db
ith_stat_types <- snakemake@params$ith_stat_types

xcr_diversity_measures <- as.vector(outer(c("tcr", "bcr"), c("clonotypes_unique", "shannon_entropy", "inverse_simpson", "D50_index", "chao1"),
                                function(x, y) paste(x, y, sep="_")))
```

```{r}
annotation_colours <- ithi.figures::get_annotation_colours()

ihc_table <- fread(ihc_table_path)
xcr_table <- read_clonotypes(xcr_table_path, duplicates = FALSE, db_path = db_path)

tree_branch_data <- read_clone_tree_data(clone_tree_file, clone_branch_length_file, clone_prevalence_file, db_path)

neoediting_res <- supp_neoediting(neoediting_outdir, ihc_table, db_path, tree_branch_data, wtfilter = TRUE, full_epitopes = FALSE, 
                                  snv_cluster_files = snv_cluster_files)

xcr_diversity <- ithi.supp::get_xcr_diversity(tcr_diversity_file, bcr_diversity_file, db_path, xcr_table)

remixt_ploidy <- read.table(remixt_ploidy_file, row.names = 1, header = TRUE, stringsAsFactors = FALSE)

remixt_ploidy <- remixt_ploidy %>% rownames_to_column(var = "voa")
remixt_ploidy$patient_id <- ithi.meta::factor_id(remixt_ploidy$patient_id, type = "patient_id", db_path)
remixt_ploidy$condensed_id <- ithi.meta::map_id(remixt_ploidy$voa, from = "voa", to = "condensed_id", db_path)

clonal_measures <- read_ith_stats(clonal_measures_file, db_path, duplicates = FALSE)
```


## Analysis

The purpose of this analysis is to show whether or not TCR repertoire diversity associates with subclonal neoepitope depletion. If so, I suppose the biological interpretation would be that a more diverse TCR repertoire would be better able to eliminate specific (subclonal) antigens. 

There are 2 things we should use to address this line of thinking:

* Show that TCR/BCR repertoire diversity is NOT associated with ITH
* Show whether or not TCR repertoire diversity has anything to do with subclonal neoepitope depletion

The first point -- at a high level -- challenges the assumption that more diverse TCR repertoires may contend with antigenic diversity associated with subclonal diversification, while the latter point directly addresses the reviewer comment. To then explain how TCR diversity can be collinear with epithelial CD8+ TIL density but not be associated with ITH, I need to review the results of the TIL cluster vs. XCR diversity analysis. (TODO!)

### XCR diversity-ITH association

We mention this in the paper, but to show a plot:

```{r}
xcr_diversity_melted <- melt(xcr_diversity, id.vars = c("patient_id", "condensed_id"), measure.vars = xcr_diversity_measures)

xcr_clonal_diversity <- merge(clonal_measures, xcr_diversity_melted)
```

```{r}
for (stat_type in ith_stat_types) {
  pvals <- ithi.utils::compute_pvals_subsets(xcr_clonal_diversity, facet_vars = c("variable"), formula = as.formula(paste0("~ ", stat_type, "+ value")), corfun = cor.test, method = "spearman")
  
  p <- ggplot(xcr_clonal_diversity, aes_string(x=stat_type, y="value")) + geom_point(aes(colour=patient_id)) + theme_bw() + theme_Publication() + 
    theme_nature() + scale_colour_manual(values = annotation_colours$patient_id) + xlab(stat_type) + ylab("XCR diversity") + 
    facet_wrap(~ variable, scales = "free") + geom_text(data=pvals, aes(x=Inf, y=Inf, label=p.value.text), hjust=1.1, vjust=1.5,size=2.5,parse=TRUE)
  
  print(p)
}
```

Even when working with uncorrected p-values, it's quite obvious that there's no correlation between ITH and XCR diversity. 

### Subclonal neoepitope depletion


```{r}
rates_xcr_analysis <- function(rates_unmerged, xcr_diversity, divcol = "tcr_clonotypes_unique") {
  rates_unmerged_xcr <- plyr::join(rates_unmerged, xcr_diversity)
  rates_unmerged_xcr$patient_id <- ithi.meta::map_id(rates_unmerged_xcr$condensed_id, from = "condensed_id", to = "patient_id", db_path)
  div_cols <- c("tcr_clonotypes_unique", "tcr_shannon_entropy", "tcr_inverse_simpson",
                "bcr_clonotypes_unique", "bcr_shannon_entropy", "bcr_inverse_simpson")
  
  for (col in div_cols) {
    rates_unmerged_xcr[,col] <- rates_unmerged_xcr[,col]/max(rates_unmerged_xcr[,col],na.rm=TRUE)
  }
  
  mod <- glmer(as.formula(paste0("expratio/obsratio ~ ", divcol, "+ (1 | patient_id)")), data=rates_unmerged_xcr[!is.na(rates_unmerged_xcr[,divcol]),], family = Gamma(link="log"))
  summod <- summary(mod)
  pval <- unname(summod$coefficients[,4][2])
  
  rates_unmerged_xcr$ei <- with(rates_unmerged_xcr, expratio/obsratio)
  
  labeller_vector <- c(paste("Patient", unique(rates_unmerged_xcr$patient_id)))
  names(labeller_vector) <- as.character(unique(rates_unmerged_xcr$patient_id))
  
  p1 <- ggplot(rates_unmerged_xcr, aes_string(x = divcol, y = "ei")) + theme_bw() + 
      ithi.utils::theme_Publication() + ithi.utils::theme_nature() + ithi.utils::stripped_theme() + ylab("Expected/observed neoantigen ratio") + geom_point() + 
      scale_color_manual(values = annotation_colours$patient_id) + facet_wrap(~patient_id, scales = "free", labeller = as_labeller(labeller_vector)) + 
      xlab(divcol) + guides(colour=guide_legend(title = "Patient", nrow = 2)) + ithi.utils::ggmargins(type = "topminus") 
  
  return(list(plot=p1, pval=pval))
}

```

```{r, cache=TRUE}
subclonal_rates_unmerged <- neoediting_res$subclonal_rates_unmerged %>% plyr::rename(c('sample_key'='condensed_id'))

tcr_uniqclone_neoediting_subclonal_res <- rates_xcr_analysis(subclonal_rates_unmerged, xcr_diversity, divcol = "tcr_clonotypes_unique")
bcr_uniqclone_neoediting_subclonal_res <- rates_xcr_analysis(subclonal_rates_unmerged, xcr_diversity, divcol = "bcr_clonotypes_unique")

tcr_shannon_neoediting_subclonal_res <- rates_xcr_analysis(subclonal_rates_unmerged, xcr_diversity, divcol = "tcr_shannon_entropy")
bcr_shannon_neoediting_subclonal_res <- rates_xcr_analysis(subclonal_rates_unmerged, xcr_diversity, divcol = "bcr_shannon_entropy")

tcr_simpson_neoediting_subclonal_res <- rates_xcr_analysis(subclonal_rates_unmerged, xcr_diversity, divcol = "tcr_inverse_simpson")
bcr_simpson_neoediting_subclonal_res <- rates_xcr_analysis(subclonal_rates_unmerged, xcr_diversity, divcol = "bcr_inverse_simpson")
```

(These are for subclonal neoediting.)

```{r, cache=TRUE}
clonal_rates_unmerged <- neoediting_res$clonal_rates_unmerged %>% plyr::rename(c('sample_key'='condensed_id'))

tcr_uniqclone_neoediting_clonal_res <- rates_xcr_analysis(clonal_rates_unmerged, xcr_diversity, divcol = "tcr_clonotypes_unique")
bcr_uniqclone_neoediting_clonal_res <- rates_xcr_analysis(clonal_rates_unmerged, xcr_diversity, divcol = "bcr_clonotypes_unique")

tcr_shannon_neoediting_clonal_res <- rates_xcr_analysis(clonal_rates_unmerged, xcr_diversity, divcol = "tcr_shannon_entropy")
bcr_shannon_neoediting_clonal_res <- rates_xcr_analysis(clonal_rates_unmerged, xcr_diversity, divcol = "bcr_shannon_entropy")

tcr_simpson_neoediting_clonal_res <- rates_xcr_analysis(clonal_rates_unmerged, xcr_diversity, divcol = "tcr_inverse_simpson")
bcr_simpson_neoediting_clonal_res <- rates_xcr_analysis(clonal_rates_unmerged, xcr_diversity, divcol = "bcr_inverse_simpson")
```

So, this is a bit interesting. While we don't really see any robust results for clonal neoediting and TCR/BCR diversity, we do see that TCR, but not BCR diversity is (negatively) correlated with E_i. The plots aren't super convincing:

```{r}
tcr_shannon_neoediting_subclonal_res$plot
```

But this implies that perhaps less diverse -- more expanded? -- TCR populations are also a sign of immunoediting. 

A natural question to ask is whether or not this is merely because TCR diversity and CD8+ TIL densities may be negatively correlated within patients. We can see whether that's the case:

```{r}
ihc_table$patient_id <- ithi.meta::factor_id(ihc_table$patient_id, type = "patient_id", db_path)
ihc_xcr_div <- ihc_table %>% plyr::join(xcr_diversity)
ihc_xcr_div$E_CD8_rescaled <- ihc_xcr_div$E_CD8_density/max(ihc_xcr_div$E_CD8_density,na.rm=TRUE)
ihc_xcr_div$tcr_shannon_entropy_rescaled <- ihc_xcr_div$tcr_shannon_entropy/max(ihc_xcr_div$tcr_shannon_entropy,na.rm=TRUE)

dat <- subset(ihc_xcr_div, !is.na(E_CD8_density) & !is.na(tcr_shannon_entropy))
singleton_patients <- (dat %>% group_by(patient_id) %>% summarise(n=n()) %>% subset(n == 1))$patient_id

glmer(as.formula(paste0("E_CD8_rescaled ~ ", "tcr_shannon_entropy_rescaled", "+ (1 | patient_id)")), data=dat %>% subset(!patient_id %in% singleton_patients), family = Gamma(link="log")) %>% summary
```

And there's no correlation here. So it's not true that, within patients, samples with higher epithelial CD8+ TIL densities necessarily have lower TCR diversity. 


```{r}
subclonal_rates_ihc_xcr_div <- plyr::join(subclonal_rates_unmerged, ihc_xcr_div)

mod <- glmer(expratio/obsratio ~ E_CD8_rescaled + tcr_shannon_entropy_rescaled + (1 | patient_id), data=subset(subclonal_rates_ihc_xcr_div, !is.na(E_CD8_density) & !is.na(tcr_shannon_entropy_rescaled)), family = Gamma(link="log"))
summod <- summary(mod)
summod
```

Note how the sign (of the TCR coefficient) changes. 

Adding in ploidy ...

```{r}
subclonal_rates_ihc_xcr_div_ploidy <- plyr::join(subclonal_rates_ihc_xcr_div, remixt_ploidy, by = c("condensed_id", "patient_id")) 

mod <- glmer(expratio/obsratio ~ E_CD8_rescaled + tcr_shannon_entropy_rescaled + psi + Cellularity + (1 | patient_id), data=subset(subclonal_rates_ihc_xcr_div_ploidy, !is.na(E_CD8_density) & !is.na(tcr_shannon_entropy_rescaled)), family = Gamma(link="log"))
summod <- summary(mod)
summod
```

So, putting everything together we still have epithelial CD8+ TIL density being significant, TCR diversity being a negatively associated factor at the edge of significance, while controlling for ploidy and cellularity. 

In summary, TCR entropy alone isn't significantly correlated with subclonal neoepitope elimination. When you add epithelial CD8+ TIL density to the explanatory variables, all of a sudden it is -- but likely because it's collinear with E_CD8_rescaled, given that the sign of the coefficient changes. Regularization could help here but no idea how to do it in glmer. 

Some extensions might be to use other diversity indices (e.g. D50 or Berger-Parker indices).
