library(ithi.utils)
load_base_libs()
library(ithi.meta)
library(ithi.spatial)
library(ithi.external)
he_results_dir <- snakemake@input$he_results_dir
tumour_purity_file <- snakemake@input$tumour_purity_file
ihc_table_path <- snakemake@input$ihc_table
molecular_subtype_file <- snakemake@input$molsubtypes
image_summary_file <- snakemake@input$image_summary
image_summary_file2 <- snakemake@input$image_summary2
finnhe_pipeline_results_dir <- snakemake@input$finnhe_pipeline_results_dir
db_path <- snakemake@params$db
tils_for_cluster <- snakemake@params$tils_for_cluster
These are the results from the H&E image classifier. We’re going to see if we can derive any features of interest to separate N/S/ES-TIL samples off of this.
annotation_colours <- ithi.figures::get_annotation_colours()
hotspot_stats_file <- file.path(finnhe_pipeline_results_dir, "overlap_stats_combined.tsv")
detection_summary_file <- file.path(finnhe_pipeline_results_dir, "detection_summary_combined.tsv")
segmented_detection_summary_file <- file.path(finnhe_pipeline_results_dir, "segmented_detection_summary_combined.tsv")
tumour_purity <- fread(tumour_purity_file)
annotation_measurement_files <- list.files(he_results_dir, pattern = "_annotation_measurements.txt",
full.names = TRUE, recursive = FALSE)
detection_measurement_files <- list.files(he_results_dir, pattern = "_detection_measurements.txt",
full.names = TRUE, recursive = FALSE)
ihc_table <- fread(ihc_table_path)
molsubtypes <- fread(molecular_subtype_file)
hotspot_stats <- fread(hotspot_stats_file)
til_clusters <- ithi.figures:::get_til_clusters(ihc_table, molsubtypes, tils_for_cluster = tils_for_cluster,
nclusts = 3)
read_annotation_measurements <- function(annotation_measurement_file) {
annotations <- data.table::fread(annotation_measurement_file)
annotations <- annotations %>% plyr::rename(c(`Centroid X µm` = "centroid_x",
`Centroid Y µm` = "centroid_y", `Area µm^2` = "area", `Perimeter µm` = "perimeter"))
annotations$area_proportion <- annotations$area/(annotations$area[annotations$Class ==
"Stroma"] + annotations$area[annotations$Class == "Tumor"])
return(annotations)
}
read_detection_measurements <- function(detection_measurement_file) {
detections <- data.table::fread(detection_measurement_file)
}
annotation_table <- lapply(annotation_measurement_files, function(f) {
raw_id <- parse_sample_ids(f)
annotations <- read_annotation_measurements(f)
data.frame(voa = raw_id, annotations)
}) %>% rbind.fill
annotation_table$condensed_id <- ithi.meta::map_id(annotation_table$voa, from = "voa",
to = "condensed_id", db_path)
Let’s check that the cellularity estimates from this are approximately correlated with those from WGS.
We don’t expect them to be exact, of course, given differences in cell size, i.e. DNA content-to-area ratio, but it should still be approximately correlated.
annotation_purity <- annotation_table %>% plyr::join(tumour_purity, type = "inner")
annotation_purity_tumour <- subset(annotation_purity, Class == "Tumor")
annotation_purity_tumour$patient_id <- as.character(annotation_purity_tumour$patient_id) %>%
ithi.meta::factor_id(type = "patient_id", db_path)
correlate_and_scatterplot <- function(df, xvar, yvar, xlab, ylab, annotation_colours,
log = "none") {
df <- data.frame(df, stringsAsFactors = FALSE)
corres <- cor.test(df[, xvar], df[, yvar], method = "spearman")
corres_text <- as.character(as.expression(substitute(rho == sprho * "," *
~~italic(P) == p, list(sprho = format(corres$estimate, digits = 3),
p = format(corres$p.value, digits = 3)))))
p <- ggplot(df, aes_string(x = xvar, y = yvar)) + geom_point(aes(colour = patient_id)) +
theme_bw() + theme_Publication() + theme_nature() + scale_colour_manual(values = annotation_colours$patient_id) +
xlab(xlab) + ylab(ylab) + annotate(geom = "text", label = corres_text,
x = Inf, y = Inf, hjust = 1, vjust = 1, parse = TRUE)
if (log == "x" | log == "xy") {
p <- p + scale_x_continuous(trans = "log10", breaks = ithi.utils::log_scale_breaks(),
labels = ithi.utils::log_scale_labels())
}
if (log == "y" | log == "xy") {
p <- p + scale_y_continuous(trans = "log10", breaks = ithi.utils::log_scale_breaks(),
labels = ithi.utils::log_scale_labels())
}
return(p)
}
correlate_and_scatterplot(annotation_purity_tumour, xvar = "tumour_content",
yvar = "area_proportion", xlab = "WGS cellularity", ylab = "% Area (H&E)",
annotation_colours)
They’re definitely correlated, but quite far from perfect. To some extent, we expected this, though.
NOTE: This method assumes that non-lymphocytes in epithelial areas are tumour cells.
segmented_detections <- fread(segmented_detection_summary_file)
segmented_detection_summary <- segmented_detections %>% group_by(condensed_id) %>%
summarise(cellularity_he = nonlympho[masktype == "tumour"]/sum(nonlympho +
lympho))
segmented_detection_summary_purity <- segmented_detection_summary %>% plyr::join(tumour_purity,
type = "inner")
segmented_detection_summary_purity$patient_id <- ithi.meta::factor_id(segmented_detection_summary_purity$patient_id,
type = "patient_id", db_path)
correlate_and_scatterplot(segmented_detection_summary_purity, xvar = "tumour_content",
yvar = "cellularity_he", xlab = "WGS cellularity", ylab = "% Tumour cells (H&E)",
annotation_colours)
image_summary1 <- read_he_image_summary(image_summary_file, db_path)
image_summary2 <- read_he_image_summary(image_summary_file2, db_path)
image_summary <- plyr::rbind.fill(list(image_summary1, image_summary2))
yinyin_cellularity <- image_summary %>% plyr::join(tumour_purity, type = "inner") %>%
plyr::rename(c(TumourCellRatio = "area_proportion"))
correlate_and_scatterplot(yinyin_cellularity, xvar = "tumour_content", yvar = "area_proportion",
xlab = "WGS cellularity", ylab = "% Area (H&E)", annotation_colours)
Correlation is a bit better. However, critically, we should note that the area estimates from this method were closer (in terms of their absolute values) to the real GWS cellularity estimates.
Question: Finn, for your results, is area computed with respect to the whole slide (i.e. a rectangle) or with respect to only those areas that have tissue? I’m assuming it’s the former and that’s why the highest samples only have ~40% area.
Next thing to check is whether or not lymphocyte proportion is comparable to our measures from bulk expression/IHC.
detection_summary <- fread(detection_summary_file)
ihc_table$T_total_density_approx <- with(ihc_table, (T_CD8_count + T_CD4_count +
T_CD20_count + T_Plasma_count)/RUN1_T_area_pct)
detection_ihc <- detection_summary %>% plyr::join(ihc_table, type = "inner")
detection_ihc$patient_id <- ithi.meta::factor_id(detection_ihc$patient_id, type = "patient_id",
db_path)
correlate_and_scatterplot(detection_ihc, xvar = "T_total_density_approx", yvar = "lympho_prop",
xlab = "Total TIL density (approx)", ylab = "Lymphocyte proportion (H&E)",
annotation_colours, log = "x")
Still have to do an exploration of what the cell parameters look like for each class of cell. More of a QC check.
hotspot_stats_til_clusters <- merge(hotspot_stats, til_clusters)
hotspot_stats_til_clusters_melted <- reshape2::melt(hotspot_stats_til_clusters,
id.vars = c("condensed_id", "mask", "cluster"), measure.vars = c("fc", "fi",
"fci"), variable.name = "measure", value.name = "stat")
pvals <- compute_pvals_subsets(hotspot_stats_til_clusters_melted, facet_vars = c("mask",
"measure"), formula = stat ~ cluster, corfun = kruskal.test)
ggplot(hotspot_stats_til_clusters_melted, aes(x = cluster, y = stat)) + geom_boxplot(outlier.size = -1) +
geom_jitter(position = position_jitter(width = 0.2, height = 0), alpha = 0.3) +
theme_bw() + facet_grid(measure ~ mask, scales = "free") + theme_Publication() +
theme_nature() + geom_text(data = pvals, aes(x = Inf, y = Inf, label = p.adj.text),
hjust = 1.1, vjust = 1.5, size = 2.5, parse = TRUE)
These statistics are:
The key here is to look at the tumour column – this corresponds to epithelial area, which is what we’re interested in.
P-values are adjusted for multiple testing w.r.t. all areas, including vascular and whitespace, but since those weren’t part of our hypothesis we can exclude those from the multiple testing in a future iteration.
A key assumption is that non-lymphocytes are cancer cells in epithelial areas and fibroblasts in stromal areas. Finn, do you think we can say this? Or would it be possible to include 3 classes in the RF model, i.e. cancer, lymphocyte, and other?
Key observations are:
One of the reviewers pointed out the idea of classifying N/S/ES-TIL for TCGA/ICGC samples on the basis of expression profiles. While we haven’t been able to do this accurately, it might be possible to do this off of the H&E images, e.g. from the Cancer Digital Slide Archive (which I’ve checked has TCGA-OV). This would allow us to validate our other measures on a larger cohort.
This is the next step for this line of analysis.