Setup

library(ithi.utils)
load_base_libs()
library(ithi.meta)
library(ithi.spatial)
library(ithi.external)
he_results_dir <- snakemake@input$he_results_dir
tumour_purity_file <- snakemake@input$tumour_purity_file
ihc_table_path <- snakemake@input$ihc_table
molecular_subtype_file <- snakemake@input$molsubtypes

image_summary_file <- snakemake@input$image_summary
image_summary_file2 <- snakemake@input$image_summary2
finnhe_pipeline_results_dir <- snakemake@input$finnhe_pipeline_results_dir

db_path <- snakemake@params$db
tils_for_cluster <- snakemake@params$tils_for_cluster

Analysis

These are the results from the H&E image classifier. We’re going to see if we can derive any features of interest to separate N/S/ES-TIL samples off of this.

Tumour/Stromal area

annotation_colours <- ithi.figures::get_annotation_colours()

hotspot_stats_file <- file.path(finnhe_pipeline_results_dir, "overlap_stats_combined.tsv")
detection_summary_file <- file.path(finnhe_pipeline_results_dir, "detection_summary_combined.tsv")
segmented_detection_summary_file <- file.path(finnhe_pipeline_results_dir, "segmented_detection_summary_combined.tsv")

tumour_purity <- fread(tumour_purity_file)
annotation_measurement_files <- list.files(he_results_dir, pattern = "_annotation_measurements.txt", 
    full.names = TRUE, recursive = FALSE)
detection_measurement_files <- list.files(he_results_dir, pattern = "_detection_measurements.txt", 
    full.names = TRUE, recursive = FALSE)

ihc_table <- fread(ihc_table_path)
molsubtypes <- fread(molecular_subtype_file)

hotspot_stats <- fread(hotspot_stats_file)

til_clusters <- ithi.figures:::get_til_clusters(ihc_table, molsubtypes, tils_for_cluster = tils_for_cluster, 
    nclusts = 3)
read_annotation_measurements <- function(annotation_measurement_file) {
    annotations <- data.table::fread(annotation_measurement_file)
    annotations <- annotations %>% plyr::rename(c(`Centroid X µm` = "centroid_x", 
        `Centroid Y µm` = "centroid_y", `Area µm^2` = "area", `Perimeter µm` = "perimeter"))
    annotations$area_proportion <- annotations$area/(annotations$area[annotations$Class == 
        "Stroma"] + annotations$area[annotations$Class == "Tumor"])
    return(annotations)
}

read_detection_measurements <- function(detection_measurement_file) {
    detections <- data.table::fread(detection_measurement_file)
    
}
annotation_table <- lapply(annotation_measurement_files, function(f) {
    raw_id <- parse_sample_ids(f)
    
    annotations <- read_annotation_measurements(f)
    data.frame(voa = raw_id, annotations)
}) %>% rbind.fill

annotation_table$condensed_id <- ithi.meta::map_id(annotation_table$voa, from = "voa", 
    to = "condensed_id", db_path)

Let’s check that the cellularity estimates from this are approximately correlated with those from WGS.

We don’t expect them to be exact, of course, given differences in cell size, i.e. DNA content-to-area ratio, but it should still be approximately correlated.

annotation_purity <- annotation_table %>% plyr::join(tumour_purity, type = "inner")
annotation_purity_tumour <- subset(annotation_purity, Class == "Tumor")
annotation_purity_tumour$patient_id <- as.character(annotation_purity_tumour$patient_id) %>% 
    ithi.meta::factor_id(type = "patient_id", db_path)
correlate_and_scatterplot <- function(df, xvar, yvar, xlab, ylab, annotation_colours, 
    log = "none") {
    df <- data.frame(df, stringsAsFactors = FALSE)
    corres <- cor.test(df[, xvar], df[, yvar], method = "spearman")
    corres_text <- as.character(as.expression(substitute(rho == sprho * "," * 
        ~~italic(P) == p, list(sprho = format(corres$estimate, digits = 3), 
        p = format(corres$p.value, digits = 3)))))
    
    p <- ggplot(df, aes_string(x = xvar, y = yvar)) + geom_point(aes(colour = patient_id)) + 
        theme_bw() + theme_Publication() + theme_nature() + scale_colour_manual(values = annotation_colours$patient_id) + 
        xlab(xlab) + ylab(ylab) + annotate(geom = "text", label = corres_text, 
        x = Inf, y = Inf, hjust = 1, vjust = 1, parse = TRUE)
    
    if (log == "x" | log == "xy") {
        p <- p + scale_x_continuous(trans = "log10", breaks = ithi.utils::log_scale_breaks(), 
            labels = ithi.utils::log_scale_labels())
    }
    
    if (log == "y" | log == "xy") {
        p <- p + scale_y_continuous(trans = "log10", breaks = ithi.utils::log_scale_breaks(), 
            labels = ithi.utils::log_scale_labels())
    }
    
    return(p)
}
correlate_and_scatterplot(annotation_purity_tumour, xvar = "tumour_content", 
    yvar = "area_proportion", xlab = "WGS cellularity", ylab = "% Area (H&E)", 
    annotation_colours)

They’re definitely correlated, but quite far from perfect. To some extent, we expected this, though.

Better method: tumour cells vs. all others

NOTE: This method assumes that non-lymphocytes in epithelial areas are tumour cells.

segmented_detections <- fread(segmented_detection_summary_file)
segmented_detection_summary <- segmented_detections %>% group_by(condensed_id) %>% 
    summarise(cellularity_he = nonlympho[masktype == "tumour"]/sum(nonlympho + 
        lympho))

segmented_detection_summary_purity <- segmented_detection_summary %>% plyr::join(tumour_purity, 
    type = "inner")
segmented_detection_summary_purity$patient_id <- ithi.meta::factor_id(segmented_detection_summary_purity$patient_id, 
    type = "patient_id", db_path)
correlate_and_scatterplot(segmented_detection_summary_purity, xvar = "tumour_content", 
    yvar = "cellularity_he", xlab = "WGS cellularity", ylab = "% Tumour cells (H&E)", 
    annotation_colours)

Compare: previous results (Yinyin’s lab)

image_summary1 <- read_he_image_summary(image_summary_file, db_path)
image_summary2 <- read_he_image_summary(image_summary_file2, db_path)

image_summary <- plyr::rbind.fill(list(image_summary1, image_summary2))

yinyin_cellularity <- image_summary %>% plyr::join(tumour_purity, type = "inner") %>% 
    plyr::rename(c(TumourCellRatio = "area_proportion"))
correlate_and_scatterplot(yinyin_cellularity, xvar = "tumour_content", yvar = "area_proportion", 
    xlab = "WGS cellularity", ylab = "% Area (H&E)", annotation_colours)

Correlation is a bit better. However, critically, we should note that the area estimates from this method were closer (in terms of their absolute values) to the real GWS cellularity estimates.

Question: Finn, for your results, is area computed with respect to the whole slide (i.e. a rectangle) or with respect to only those areas that have tissue? I’m assuming it’s the former and that’s why the highest samples only have ~40% area.

Lymphocyte proportion

Next thing to check is whether or not lymphocyte proportion is comparable to our measures from bulk expression/IHC.

detection_summary <- fread(detection_summary_file)
ihc_table$T_total_density_approx <- with(ihc_table, (T_CD8_count + T_CD4_count + 
    T_CD20_count + T_Plasma_count)/RUN1_T_area_pct)

detection_ihc <- detection_summary %>% plyr::join(ihc_table, type = "inner")
detection_ihc$patient_id <- ithi.meta::factor_id(detection_ihc$patient_id, type = "patient_id", 
    db_path)
correlate_and_scatterplot(detection_ihc, xvar = "T_total_density_approx", yvar = "lympho_prop", 
    xlab = "Total TIL density (approx)", ylab = "Lymphocyte proportion (H&E)", 
    annotation_colours, log = "x")

Cell parameters

Still have to do an exploration of what the cell parameters look like for each class of cell. More of a QC check.

Getis-Ord Gi Hotspots

hotspot_stats_til_clusters <- merge(hotspot_stats, til_clusters)
hotspot_stats_til_clusters_melted <- reshape2::melt(hotspot_stats_til_clusters, 
    id.vars = c("condensed_id", "mask", "cluster"), measure.vars = c("fc", "fi", 
        "fci"), variable.name = "measure", value.name = "stat")
pvals <- compute_pvals_subsets(hotspot_stats_til_clusters_melted, facet_vars = c("mask", 
    "measure"), formula = stat ~ cluster, corfun = kruskal.test)
ggplot(hotspot_stats_til_clusters_melted, aes(x = cluster, y = stat)) + geom_boxplot(outlier.size = -1) + 
    geom_jitter(position = position_jitter(width = 0.2, height = 0), alpha = 0.3) + 
    theme_bw() + facet_grid(measure ~ mask, scales = "free") + theme_Publication() + 
    theme_nature() + geom_text(data = pvals, aes(x = Inf, y = Inf, label = p.adj.text), 
    hjust = 1.1, vjust = 1.5, size = 2.5, parse = TRUE)

These statistics are:

  • fc = proportion of NonLympho hotspots that are also Lympho hotspots
  • fi = proportion of Lympho hotspots that are also NonLympho hotspots
  • fci = proportion of total mask area covered by NonLympho-Lympho hotspots (intersection)

The key here is to look at the tumour column – this corresponds to epithelial area, which is what we’re interested in.

P-values are adjusted for multiple testing w.r.t. all areas, including vascular and whitespace, but since those weren’t part of our hypothesis we can exclude those from the multiple testing in a future iteration.

A key assumption is that non-lymphocytes are cancer cells in epithelial areas and fibroblasts in stromal areas. Finn, do you think we can say this? Or would it be possible to include 3 classes in the RF model, i.e. cancer, lymphocyte, and other?

Key observations are:

  • no differences between TIL clusters for stromal – not unexpected as we don’t really have a strong reason to suspect a fibroblast-immune cell interaction that differs between TIL subtypes (you might be able to make a case for an S-TIL hypothesis, but this doesn’t seem to be supported by the data)
  • S-TIL has lower cancer-lymphocyte clustering than N-TIL or ES-TIL. Surprisingly enough N-TIL has comparable levels to ES-TIL. Doesn’t affect our interpretation TOO much as N-TILs have fewer TIL to start with, and this type of hotspot analysis looks for RELATIVE hotspots. So if you have only 5 TIL across a slide, seeing 2 of them close to each other may result in an area being called a hotspot in that slide.

Followups

One of the reviewers pointed out the idea of classifying N/S/ES-TIL for TCGA/ICGC samples on the basis of expression profiles. While we haven’t been able to do this accurately, it might be possible to do this off of the H&E images, e.g. from the Cancer Digital Slide Archive (which I’ve checked has TCGA-OV). This would allow us to validate our other measures on a larger cohort.

This is the next step for this line of analysis.

---
title: "qPath hotspot analysis"
---
                        ```{r, echo=FALSE, message=FALSE, warning=FALSE}

######## Snakemake header ########
library(methods)
Snakemake <- setClass(
    "Snakemake",
    slots = c(
        input = "list",
        output = "list",
        params = "list",
        wildcards = "list",
        threads = "numeric",
        log = "list",
        resources = "list",
        config = "list",
        rule = "character"
    )
)
snakemake <- Snakemake(
    input = list('/shahlab/alzhang/data/ithi/finn_results/he_output_Nov29', 'notebooks/qpath_hotspots.Rmd', '/shahlab/alzhang/pipeline_outputs/ith_immune/finnhe/run1', '/shahlab/alzhang/data/ithi/yuan_hecr_image_results_2.csv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/molsubtypes.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/ihc_table.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/tumour_purity.tsv', '/shahlab/alzhang/data/ithi/yuan_hecr_image_results.csv', "he_results_dir" = '/shahlab/alzhang/data/ithi/finn_results/he_output_Nov29', "notebook" = 'notebooks/qpath_hotspots.Rmd', "finnhe_pipeline_results_dir" = '/shahlab/alzhang/pipeline_outputs/ith_immune/finnhe/run1', "image_summary2" = '/shahlab/alzhang/data/ithi/yuan_hecr_image_results_2.csv', "molsubtypes" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/molsubtypes.tsv', "ihc_table" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/ihc_table.tsv', "tumour_purity_file" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/tumour_purity.tsv', "image_summary" = '/shahlab/alzhang/data/ithi/yuan_hecr_image_results.csv'),
    output = list('/shahlab/alzhang/projects/ITH_Immune/paper/results/review/notebooks/run2/qpath_hotspots.nb.html'),
    params = list(c('E_CD8_density', 'E_CD4_density', 'E_CD20_density', 'E_Plasma_density', 'S_CD8_density', 'S_CD4_density', 'S_CD20_density', 'S_Plasma_density'), '/shahlab/alzhang/projects/ITH_Immune/metadata/db/immune_project.sqlite3', 'qpath_hotspots_analysis', "tils_for_cluster" = c('E_CD8_density', 'E_CD4_density', 'E_CD20_density', 'E_Plasma_density', 'S_CD8_density', 'S_CD4_density', 'S_CD20_density', 'S_Plasma_density'), "db" = '/shahlab/alzhang/projects/ITH_Immune/metadata/db/immune_project.sqlite3', "name" = 'qpath_hotspots_analysis'),
    wildcards = list(),
    threads = 1,
    log = list('/shahlab/alzhang/clusttmp/paperreview2/notebooks/qpath_hotspots_analysis.log'),
    resources = list(),
    config = list("nanostring_data" = '/shahlab/alzhang/projects/ITH_Immune/results/nanostring_results/ith_full/qc/limma_quantile/normalized_expression_voa_labels_filtered.tsv', "til_clusters_output" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/intermediates/run2/til_clusters_output.txt', "igpartition_outdir" = '/shahlab/alzhang/pipeline_outputs/ith_immune/igpartition/run22', "icgc_subtypes" = '/shahlab/alzhang/data/ICGC/icgc_primary_tumour_subtypes.tsv', "array_expression_file" = '/shahlab/alzhang/projects/ITH_Immune/data/expression/array/gene_exprs_rma_batch_corrected.txt', "clone_branch_lengths" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/branch_data.tsv', "breakpoint_table" = '/shahlab/amcpherson/projects/ith3/ith3/notebooks/bespoke/ith_breakpoints.tsv', "all_tiltypes" = c('T_CD8_density', 'T_CD4_density', 'T_CD20_density', 'T_Plasma_density', 'E_CD8_density', 'E_CD4_density', 'E_CD20_density', 'E_Plasma_density', 'S_CD8_density', 'S_CD4_density', 'S_CD20_density', 'S_Plasma_density'), "ith_stats" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/ith_statistics.tsv', "bcr_diversity" = '/shahlab/alzhang/pipeline_outputs/ith_immune/mixcr/mixcr_runs/ith_1_2_3/mixcr5/postprocess/IGH/postfilter_diversity_stats/diversity.strict.resampled.txt', "ihc_features_output" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/intermediates/run2/ihc_features_output.txt', "somatic_coding_result_dir" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/somatic_coding_variants', "benchmarkdir" = '/shahlab/alzhang/benchmarks/paperreview2', "total_tiltypes" = c('T_CD8_density', 'T_CD4_density', 'T_CD20_density', 'T_Plasma_density'), "finnhe_pipeline_results_dir" = '/shahlab/alzhang/pipeline_outputs/ith_immune/finnhe/run1', "clola_result_file" = '/shahlab/alzhang/pipeline_outputs/ith_immune/clola/run4/clola_condensed_results/beta/clola_results.tsv', "molsubtypes" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/molsubtypes.tsv', "mmctm_final_patient_dir" = '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/ith_by-patient_with-ov', "tils_for_cluster" = c('E_CD8_density', 'E_CD4_density', 'E_CD20_density', 'E_Plasma_density', 'S_CD8_density', 'S_CD4_density', 'S_CD20_density', 'S_Plasma_density'), "variability_type" = 'stabilize', "prevalence_threshold" = 0.01, "he_results_dir" = '/shahlab/alzhang/data/ithi/finn_results/he_output_Nov29', "distance_method" = 'horn', "tils_for_variability" = c('T_CD8_density', 'T_CD4_density', 'T_CD20_density', 'T_Plasma_density'), "xcr_table" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/xcr_table.tsv', "clone_trees" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/tree_data.tsv', "neoediting_outdir" = '/shahlab/alzhang/pipeline_outputs/ith_immune/neoediting/run6', "image_summary" = '/shahlab/alzhang/data/ithi/yuan_hecr_image_results.csv', "remixt_cellularity_ploidy" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/remixt_cellularity_ploidy.tsv', "logdir" = '/shahlab/alzhang/clusttmp/paperreview2', "ith_icgc_bc" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/ith_icgc_merged_bc.tsv', "tcr_diversity" = '/shahlab/alzhang/pipeline_outputs/ith_immune/mixcr/mixcr_runs/ith_1_2_3/mixcr5/postprocess/TRB/postfilter_diversity_stats/diversity.strict.resampled.txt', "ith_stat_types" = c('entropy', 'postprocessed_divergence', 'combined_ith_normalized', 'proportion_subclonal'), "refseq_gene_file" = '/shahlab/alzhang/data/genome/hg19/refseq_genes.bed', "db" = '/shahlab/alzhang/projects/ITH_Immune/metadata/db/immune_project.sqlite3', "snv_cluster_dir" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster', "notebook_dir" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/review/notebooks/run2', "image_summary2" = '/shahlab/alzhang/data/ithi/yuan_hecr_image_results_2.csv', "known_subtypes_array" = '/shahlab/alzhang/projects/ITH_Immune/data/expression/array/subtypes/known_subtypes.tsv', "nanostring_annotations" = '/shahlab/alzhang/projects/ITH_Immune/data/expression/nanostring/pancancer_annotations.tsv', "icgc_specimen" = '/shahlab/alzhang/data/ICGC/specimen.tsv', "clone_prevalences" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/clone_data.tsv', "copynumber_table" = '/shahlab/alzhang/data/ithi/master_copynumber_file.tsv', "rooney_mutsigcv_file" = '/shahlab/alzhang/projects/ITH_Immune/external/other_papers/mmc6.xlsx', "ihc_table" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/ihc_table.tsv', "table_dir" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/review/tables/run2', "epitopes_unique_filtered" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/epitopes_unique_filtered.tsv', "tumour_purity" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/tumour_purity.tsv', "patients_for_clonal" = c(1, 2, 3, 4, 7, 9, 10, 11, 12, 13, 14, 15, 16, 17), "tilcluster_supervised_ipynb" = '/shahlab/alzhang/projects/ITH_Immune/paper/review/ipy/tilcluster_supervisedmulticlass.ipynb', "snv_table" = '/shahlab/amcpherson/projects/ith3/ith3/notebooks/bespoke/ith_snvs.tsv'),
    rule = 'qpath_hotspots_analysis'
)
######## Original script #########

                        ```


## Setup

```{r global_chunk_options, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, tidy=TRUE, warning=FALSE, message=FALSE, cache=TRUE) #cache=TRUE
```

```{r}
library(ithi.utils)
load_base_libs()
library(ithi.meta)
library(ithi.spatial)
library(ithi.external)
```

```{r, eval = TRUE}
he_results_dir <- snakemake@input$he_results_dir
tumour_purity_file <- snakemake@input$tumour_purity_file
ihc_table_path <- snakemake@input$ihc_table
molecular_subtype_file <- snakemake@input$molsubtypes

image_summary_file <- snakemake@input$image_summary
image_summary_file2 <- snakemake@input$image_summary2
finnhe_pipeline_results_dir <- snakemake@input$finnhe_pipeline_results_dir

db_path <- snakemake@params$db
tils_for_cluster <- snakemake@params$tils_for_cluster
```

## Analysis

These are the results from the H&E image classifier. We're going to see if we can derive any features of interest to separate N/S/ES-TIL samples off of this. 

### Tumour/Stromal area

```{r}
annotation_colours <- ithi.figures::get_annotation_colours()

hotspot_stats_file <- file.path(finnhe_pipeline_results_dir, "overlap_stats_combined.tsv")
detection_summary_file <- file.path(finnhe_pipeline_results_dir, "detection_summary_combined.tsv")
segmented_detection_summary_file <- file.path(finnhe_pipeline_results_dir, "segmented_detection_summary_combined.tsv")

tumour_purity <- fread(tumour_purity_file)
annotation_measurement_files <- list.files(he_results_dir, pattern = "_annotation_measurements.txt", full.names = TRUE, recursive = FALSE)
detection_measurement_files <- list.files(he_results_dir, pattern = "_detection_measurements.txt", full.names = TRUE, recursive = FALSE)

ihc_table <- fread(ihc_table_path)
molsubtypes <- fread(molecular_subtype_file)

hotspot_stats <- fread(hotspot_stats_file)

til_clusters <- ithi.figures:::get_til_clusters(ihc_table, molsubtypes, tils_for_cluster = tils_for_cluster, nclusts = 3)
```

```{r}
read_annotation_measurements <- function(annotation_measurement_file) {
  annotations <- data.table::fread(annotation_measurement_file)
  annotations <- annotations %>% plyr::rename(c('Centroid X µm'='centroid_x', 'Centroid Y µm'='centroid_y',
                                                'Area µm^2'='area', 'Perimeter µm'='perimeter'))
  annotations$area_proportion <- annotations$area/(annotations$area[annotations$Class == "Stroma"] + annotations$area[annotations$Class == "Tumor"])
  return(annotations)
}

read_detection_measurements <- function(detection_measurement_file) {
  detections <- data.table::fread(detection_measurement_file)
  
}
```

```{r}
annotation_table <- lapply(annotation_measurement_files, function(f) {
  raw_id <- parse_sample_ids(f)
  
  annotations <- read_annotation_measurements(f)
  data.frame(voa=raw_id, annotations)
}) %>% rbind.fill

annotation_table$condensed_id <- ithi.meta::map_id(annotation_table$voa, from = "voa", to = "condensed_id", db_path)
```

Let's check that the cellularity estimates from this are approximately correlated with those from WGS. 

We don't expect them to be exact, of course, given differences in cell size, i.e. DNA content-to-area ratio, but it should still be approximately correlated. 



```{r}
annotation_purity <- annotation_table %>% plyr::join(tumour_purity, type = "inner")
```

```{r}
annotation_purity_tumour <- subset(annotation_purity, Class == "Tumor")
annotation_purity_tumour$patient_id <- as.character(annotation_purity_tumour$patient_id) %>% 
  ithi.meta::factor_id(type = "patient_id", db_path)
```

```{r}
correlate_and_scatterplot <- function(df, xvar, yvar, xlab, ylab, annotation_colours, log = "none") {
  df <- data.frame(df, stringsAsFactors = FALSE)
  corres <- cor.test(df[,xvar], df[,yvar], method = "spearman")
  corres_text <-as.character(as.expression(substitute(rho==sprho*","*~~italic(P)==p, list(sprho=format(corres$estimate, digits = 3),
                                                                                            p=format(corres$p.value, digits=3)))))
  
  p <- ggplot(df, aes_string(x=xvar, y = yvar)) + geom_point(aes(colour=patient_id)) + 
    theme_bw() + theme_Publication() + theme_nature() + scale_colour_manual(values = annotation_colours$patient_id) + 
    xlab(xlab) + ylab(ylab) + 
    annotate(geom = "text", label = corres_text, x = Inf, y = Inf, hjust = 1, vjust = 1, parse = TRUE)
  
  if (log == "x" | log == "xy") {
    p <- p + scale_x_continuous(trans = "log10", breaks = ithi.utils::log_scale_breaks(), 
                                labels = ithi.utils::log_scale_labels())
  }
  
  if (log == "y" | log == "xy") {
    p <- p + scale_y_continuous(trans = "log10", breaks = ithi.utils::log_scale_breaks(), 
                                labels = ithi.utils::log_scale_labels())
  }
  
  return(p)
}
```

```{r}
correlate_and_scatterplot(annotation_purity_tumour, xvar = "tumour_content", yvar = "area_proportion", xlab = "WGS cellularity", ylab = "% Area (H&E)", annotation_colours)
```

They're definitely correlated, but quite far from perfect. To some extent, we expected this, though. 


#### Better method: tumour cells vs. all others

**NOTE**: This method assumes that non-lymphocytes in epithelial areas are tumour cells. 

```{r}
segmented_detections <- fread(segmented_detection_summary_file)
segmented_detection_summary <- segmented_detections %>% group_by(condensed_id) %>% summarise(cellularity_he = nonlympho[masktype == "tumour"]/sum(nonlympho + lympho))

segmented_detection_summary_purity <- segmented_detection_summary %>% plyr::join(tumour_purity, type = "inner")
segmented_detection_summary_purity$patient_id <- ithi.meta::factor_id(segmented_detection_summary_purity$patient_id, type = "patient_id", db_path)
```

```{r}
correlate_and_scatterplot(segmented_detection_summary_purity, xvar = "tumour_content", yvar = "cellularity_he", xlab = "WGS cellularity", ylab = "% Tumour cells (H&E)", annotation_colours)
```


#### Compare: previous results (Yinyin's lab)

```{r}
image_summary1 <- read_he_image_summary(image_summary_file, db_path) 
image_summary2 <- read_he_image_summary(image_summary_file2, db_path)

image_summary <- plyr::rbind.fill(list(image_summary1, image_summary2))

yinyin_cellularity <- image_summary %>% plyr::join(tumour_purity, type = 'inner') %>% plyr::rename(c('TumourCellRatio'='area_proportion'))
```

```{r}
correlate_and_scatterplot(yinyin_cellularity, xvar = "tumour_content", yvar = "area_proportion", xlab = "WGS cellularity", ylab = "% Area (H&E)", annotation_colours)
```

Correlation is a bit better. However, critically, we should note that the area estimates from this method were closer (in terms of their absolute values) to the real GWS cellularity estimates. 

**Question**: Finn, for your results, is area computed with respect to the whole slide (i.e. a rectangle) or with respect to only those areas that have tissue? I'm assuming it's the former and that's why the highest samples only have ~40\% area. 

### Lymphocyte proportion

Next thing to check is whether or not lymphocyte proportion is comparable to our measures from bulk expression/IHC. 

```{r}
detection_summary <- fread(detection_summary_file)
```

```{r}
ihc_table$T_total_density_approx <- with(ihc_table, (T_CD8_count + T_CD4_count + T_CD20_count + T_Plasma_count)/RUN1_T_area_pct)

detection_ihc <- detection_summary %>% plyr::join(ihc_table, type = 'inner')
detection_ihc$patient_id <- ithi.meta::factor_id(detection_ihc$patient_id, type = "patient_id", db_path)
```

```{r}
correlate_and_scatterplot(detection_ihc, xvar = "T_total_density_approx", yvar = "lympho_prop", xlab = "Total TIL density (approx)", ylab = "Lymphocyte proportion (H&E)", annotation_colours, log = "x")
```

### Cell parameters

Still have to do an exploration of what the cell parameters look like for each class of cell. More of a QC check. 

### Getis-Ord Gi Hotspots

```{r}
hotspot_stats_til_clusters <- merge(hotspot_stats, til_clusters)
hotspot_stats_til_clusters_melted <- reshape2::melt(hotspot_stats_til_clusters, id.vars = c("condensed_id", "mask", "cluster"),
                                                    measure.vars = c("fc", "fi", "fci"), variable.name = "measure",
                                                    value.name = "stat")
```

```{r}
pvals <- compute_pvals_subsets(hotspot_stats_til_clusters_melted, facet_vars = c("mask", "measure"), formula = stat ~ cluster,
                               corfun = kruskal.test)
```

```{r, fig.height = 5, fig.width = 7}
ggplot(hotspot_stats_til_clusters_melted, aes(x=cluster, y = stat)) + geom_boxplot(outlier.size = -1) + geom_jitter(position = position_jitter(width = 0.2, height = 0), alpha = 0.3) + theme_bw() + facet_grid(measure ~ mask, scales = "free") + 
  theme_Publication() + theme_nature() + geom_text(data=pvals, aes(x=Inf, y=Inf, label=p.adj.text), hjust=1.1, vjust=1.5,size=2.5,parse=TRUE)
```

These statistics are:

* fc = proportion of NonLympho hotspots that are also Lympho hotspots
* fi = proportion of Lympho hotspots that are also NonLympho hotspots
* fci = proportion of total mask area covered by NonLympho-Lympho hotspots (intersection)

The key here is to look at the tumour column -- this corresponds to epithelial area, which is what we're interested in. 

P-values are adjusted for multiple testing w.r.t. all areas, including vascular and whitespace, but since those weren't part of our hypothesis we can exclude those from the multiple testing in a future iteration. 

A key assumption is that non-lymphocytes are cancer cells in epithelial areas and fibroblasts in stromal areas. Finn, do you think we can say this? Or would it be possible to include 3 classes in the RF model, i.e. cancer, lymphocyte, and other? 

Key observations are:

* no differences between TIL clusters for stromal -- not unexpected as we don't really have a strong reason to suspect a fibroblast-immune cell interaction that differs between TIL subtypes (you might be able to make a case for an S-TIL hypothesis, but this doesn't seem to be supported by the data)
* S-TIL has lower cancer-lymphocyte clustering than N-TIL or ES-TIL. Surprisingly enough N-TIL has comparable levels to ES-TIL. Doesn't affect our interpretation TOO much as N-TILs have fewer TIL to start with, and this type of hotspot analysis looks for RELATIVE hotspots. So if you have only 5 TIL across a slide, seeing 2 of them close to each other may result in an area being called a hotspot in that slide. 


### Followups

One of the reviewers pointed out the idea of classifying N/S/ES-TIL for TCGA/ICGC samples on the basis of expression profiles. While we haven't been able to do this accurately, it might be possible to do this off of the H&E images, e.g. from the Cancer Digital Slide Archive (which I've checked has TCGA-OV). This would allow us to validate our other measures on a larger cohort.

This is the next step for this line of analysis. 



