library(ithi.utils)
load_base_libs()
library(methods)
library(xgboost)
library(splitstackshape)
library(mclust)
library(entropy)
library(class)
library(HiDimDA)
library(cluster)
library(limma)
library(gage)
library(pathview)
library(MASS)
library(pcaPP) ## for rrlda
library(mvoutlier) ## for rrlda
library(rrlda)
library(ithi.meta)
library(ithi.figures)
library(ithi.utils)
library(ithi.expr)
## Alias base R's lower.tri() under the name rrlda apparently expects to
## find globally -- TODO confirm against the installed rrlda version.
lower.triangle <- lower.tri ## because rrlda is poorly implemented
## Input files and parameters supplied by the Snakemake rule driving this
## report.
ihc_table_path <- snakemake@input$ihc_table
nanostring_data_path <- snakemake@input$nanostring_data
nanostring_annotations_path <- snakemake@input$nanostring_annotations
molecular_subtype_file <- snakemake@input$molsubtypes
gene_expression_array_file <- snakemake@input$array_expression_file
array_known_subtype_file <- snakemake@input$known_subtypes_array
tils_for_cluster <- snakemake@params$tils_for_cluster
db_path <- snakemake@params$db
annotation_colours <- ithi.figures::get_annotation_colours()
## Load expression tables and sample annotations.
exprs <- fread(nanostring_data_path)
nanostring_labels <- fread(nanostring_annotations_path)
ihc_table <- fread(ihc_table_path)
molsubtypes <- fread(molecular_subtype_file)
## Derive the 3 TIL clusters (used as class labels throughout) from the
## IHC table and molecular subtypes.
til_clusters <- ithi.figures:::get_til_clusters(ihc_table, molsubtypes, tils_for_cluster = tils_for_cluster,
nclusts = 3)
## Array (U133A2) expression matrix; first column holds gene identifiers.
array_expression_raw <- fread(gene_expression_array_file) %>% as.data.frame %>%
tibble::column_to_rownames(var = "V1")
known_subtype_table <- fread(array_known_subtype_file)
## Rename array columns from master_id to condensed_id so they can be
## matched against the TIL cluster table later.
colnames(array_expression_raw) <- ithi.meta::df_as_map(known_subtype_table,
colnames(array_expression_raw), from = "master_id", to = "condensed_id")
Our goal here is to see if we can classify N/S/ES-TIL on the basis of expression profiles.
First, I’d like to emphasize that we only have the following data:
And we are unable to get sufficient quality/quantity of RNA from these samples to do full RNA-seq.
So our data may not be sufficient to explore this problem in any sort of depth. Also consider that we have multiple samples from the same patient – so we’re going to have to be careful not to overfit.
In this file, we’ll first work off of the Nanostring data, since we have more samples with this. If we can find something good here, then that’d be ideal.
## Reshape the Nanostring table into a samples x genes data frame: drop
## the Code.Class/Accession metadata columns, move gene names to rownames,
## then transpose so rows are samples.
X <- exprs %>% subset(select = -c(Code.Class, Accession)) %>% as.data.frame %>%
column_to_rownames(var = "Name") %>% t %>% as.data.frame
y <- til_clusters
## Nanostring sample names are VOA ids; convert to condensed_id so they
## match the TIL cluster table.
rownames(X) <- rownames(X) %>% ithi.meta::map_id(from = "voa", to = "condensed_id",
db_path)
## Keep only samples present in both tables, aligned in the same order.
shared_samples <- intersect(rownames(X), y$condensed_id)
X <- X[shared_samples, ]
y <- (y %>% column_to_rownames(var = "condensed_id"))[shared_samples, , drop = FALSE]
## Labelled copy of X (with patient_id and subtype columns) used for
## patient-aware train/test splitting in the CV routine below.
X_lab <- X %>% rownames_to_column(var = "condensed_id")
X_lab$patient_id <- ithi.meta::map_id(X_lab$condensed_id, from = "condensed_id",
to = "patient_id", db_path)
X_lab$subtype <- y$cluster
## Build an xgb.DMatrix for the given samples: rows of the expression
## matrix become features and the TIL cluster labels are recoded to the
## numeric classes xgboost's multi:softmax expects
## (N-TIL = 0, S-TIL = 1, ES-TIL = 2).
create_data_object <- function(X, y, samples) {
  feature_matrix <- as.matrix(X[samples, ])
  cluster_names <- y[samples, , drop = FALSE]$cluster
  recoded <- plyr::mapvalues(cluster_names, from = c("N-TIL", "S-TIL",
    "ES-TIL"), to = c(0:2))
  numeric_labels <- as.numeric(as.character(recoded))
  xgb.DMatrix(data = feature_matrix, label = numeric_labels)
}
## Run one randomized cross-validation round of a TIL-subtype classifier.
##
## X: samples x features matrix/data.frame, rownames are condensed_ids.
## y: data.frame keyed by condensed_id rownames with a 'cluster' column
##    holding the TIL subtype labels ("N-TIL"/"S-TIL"/"ES-TIL").
## X_lab: labelled copy of X with condensed_id/patient_id/subtype columns.
## type: one of "xgboost", "knn", "dlda", "lda", "rrlda".
##
## All samples from 3 randomly chosen patients are reserved for evaluation
## so that train and evaluation sets never share a patient. Returns a
## data.frame with one row per reserved sample: condensed_id, numeric
## prediction, and numeric actual label (0 = N-TIL, 1 = S-TIL, 2 = ES-TIL).
run_tilclassifier_cv <- function(X, y, X_lab, type = "xgboost") {
    ## Recode TIL cluster names to numeric class labels. This pipeline was
    ## previously duplicated inline at every use site.
    encode_labels <- function(samples) {
        y[samples, , drop = FALSE]$cluster %>% plyr::mapvalues(from = c("N-TIL",
            "S-TIL", "ES-TIL"), to = c(0:2)) %>% as.character %>% as.numeric
    }
    reserved_patients <- X_lab$patient_id %>% unique %>% sample(size = 3)
    X_lab_unreserved <- subset(X_lab, !patient_id %in% reserved_patients)
    if (type == "xgboost") {
        ## Further split the non-reserved samples ~70/30 into train/test,
        ## stratified by patient and subtype, to drive xgboost's watchlist.
        X_lab_test <- splitstackshape::stratified(X_lab_unreserved, c("patient_id",
            "subtype"), 0.3)
        test_samples <- X_lab_test$condensed_id
        train_samples <- setdiff(X_lab_unreserved$condensed_id, test_samples)
        reserved_samples <- setdiff(rownames(X), c(train_samples, test_samples))
        dtrain <- create_data_object(X, y, train_samples)
        dtest <- create_data_object(X, y, test_samples)
        dres <- create_data_object(X, y, reserved_samples)
        watchlist <- list(train = dtrain, test = dtest)
        xgbres <- xgb.train(params = list(num_class = 3), data = dtrain, max_depth = 2,
            eta = 1, nthread = 2, nrounds = 500, watchlist = watchlist, objective = "multi:softmax",
            verbose = FALSE)
        pred <- predict(xgbres, dres)
        dres_realvals <- encode_labels(reserved_samples)
    } else if (type %in% c("knn", "dlda", "lda", "rrlda")) {
        ## These methods need no watchlist/test split: train on everything
        ## that is not reserved and evaluate on the reserved patients only.
        train_samples <- X_lab_unreserved$condensed_id
        reserved_samples <- setdiff(rownames(X), c(train_samples))
        X_train <- X[train_samples, ]
        X_test <- X[reserved_samples, ]
        y_train <- encode_labels(train_samples)
        y_test <- encode_labels(reserved_samples)
        if (type == "knn") {
            pred <- class::knn(X_train, X_test, y_train, k = 3) %>% as.character %>%
                as.numeric
        } else if (type == "dlda") {
            model <- HiDimDA::Dlda(X_train, factor(y_train))
            pred <- predict(model, X_test)$class %>% unname %>% as.character %>%
                as.numeric
        } else if (type == "lda") {
            model <- MASS::lda(X_train, factor(y_train))
            pred <- predict(model, X_test)$class %>% unname %>% as.character %>%
                as.numeric
        } else if (type == "rrlda") {
            model <- rrlda::rrlda(X_train, factor(y_train))
            pred <- predict(model, X_test)$class %>% unname %>% as.character %>%
                as.numeric
        }
        dres_realvals <- y_test
    } else {
        ## Previously an unrecognized type fell through to an opaque
        ## "object 'reserved_samples' not found" error; fail early instead.
        stop("Unknown classifier type: ", type, call. = FALSE)
    }
    data.frame(condensed_id = reserved_samples, prediction = pred, actual = dres_realvals)
}
## 10 rounds of patient-holdout CV with xgboost; pool held-out predictions.
xgb_results <- lapply(1:10, function(i) run_tilclassifier_cv(X, y, X_lab, type = "xgboost")) %>%
rbind.fill
## Agreement between predicted and actual TIL subtypes on held-out samples.
with(xgb_results, mclust::adjustedRandIndex(prediction, actual))
[1] 0.2453873
entropy::mi.empirical(with(xgb_results, table(prediction, actual)))
[1] 0.2055565
So clearly XGBoost isn’t going to cut it. ARIs and mutual information from cross-validation are quite low (definitely < 0.5 on repeat runs). This isn’t necessarily surprising given that XGBoost is intended for more data-rich problems. That being said, we’ll attempt other methods that have been applied to smaller datasets, such as the 2 below.
## Same patient-holdout CV, now with 3-nearest-neighbours.
knn_results <- lapply(1:10, function(i) run_tilclassifier_cv(X, y, X_lab, type = "knn")) %>%
rbind.fill
with(knn_results, mclust::adjustedRandIndex(prediction, actual))
[1] 0.1432024
entropy::mi.empirical(with(knn_results, table(prediction, actual)))
[1] 0.1690655
## Confusion matrix (0 = N-TIL, 1 = S-TIL, 2 = ES-TIL).
with(knn_results, table(prediction, actual))
actual
prediction 0 1 2
0 43 12 9
1 21 22 17
2 7 1 28
## Same patient-holdout CV with diagonal LDA (suited to p >> n problems).
dlda_results <- lapply(1:10, function(i) run_tilclassifier_cv(X, y, X_lab, type = "dlda")) %>%
rbind.fill
with(dlda_results, mclust::adjustedRandIndex(prediction, actual))
[1] 0.4687117
entropy::mi.empirical(with(dlda_results, table(prediction, actual)))
[1] 0.4443378
## Confusion matrix (0 = N-TIL, 1 = S-TIL, 2 = ES-TIL).
with(dlda_results, table(prediction, actual))
actual
prediction 0 1 2
0 60 6 3
1 10 46 10
2 4 6 39
So doing this with Nanostring doesn’t look like it will cut it. With mediocre ARIs and terrible mutual information, the confidence that we can robustly predict these TIL subtypes in an external cohort is low.
Now we’ll try this from array (U133A2) expression data.
dim(array_expression_raw)
[1] 12438 63
## Keep only array samples that also have a TIL cluster assignment.
array_expression_raw_subset <- array_expression_raw[, intersect(colnames(array_expression_raw),
til_clusters$condensed_id)]
dim(array_expression_raw_subset)
[1] 12438 44
We’re losing nearly a third of our dataset because we don’t have TIL subtypes for a handful of samples (IHC/Nanostring/etc. was not performed). This was because tissue samples were not available for many of the older tumour samples (i.e. those from ITH-1). This is unfortunate, but there’s not much we can do about it.
The first thing to do is to make sure we have a chance of getting reasonable results.
## Restrict TIL clusters to samples with array expression and align the
## row order to the expression matrix columns.
til_clusters_subset <- til_clusters %>% subset(til_clusters$condensed_id %in%
colnames(array_expression_raw_subset))
rownames(til_clusters_subset) <- NULL
til_clusters_subset <- til_clusters_subset %>% column_to_rownames(var = "condensed_id")
til_clusters_subset <- til_clusters_subset[colnames(array_expression_raw_subset),
, drop = FALSE]
## limma design column names cannot contain '-', so N-TIL -> N_TIL etc.
til_clusters_subset$cluster <- til_clusters_subset$cluster %>% str_replace("\\-",
"_")
til_clusters_subset$patient_id <- ithi.meta::map_id(rownames(til_clusters_subset),
from = "condensed_id", to = "patient_id", db_path)
## Could (and should later) add patients to this model matrix Actually, I
## don't think there's an easy way of doing this
## (https://support.bioconductor.org/p/6655/) -- limma I don't think supports
## random effect models
## One indicator column per TIL cluster, no intercept (means model).
design <- model.matrix(~0 + til_clusters_subset$cluster)
## Strip the "til_clusters_subset$cluster" prefix model.matrix() adds, so
## the design columns are plain N_TIL/S_TIL/ES_TIL names.
colnames(design) <- str_replace(colnames(design), "til_clusters_subset\\$cluster",
"")
colnames(design) <- str_replace(colnames(design), "til_clusters_subset\\$",
"")
## Contrast 1: average of S-TIL and ES-TIL vs N-TIL ("hot" vs "cold").
## Contrast 2: ES-TIL vs S-TIL.
contrast.matrix <- limma::makeContrasts((S_TIL + ES_TIL)/2 - N_TIL, ES_TIL -
S_TIL, levels = design)
## Approximate a per-patient random effect via limma's consensus
## correlation for repeated samples from the same patient.
patient_random_effect <- TRUE
if (patient_random_effect) {
corfit <- duplicateCorrelation(array_expression_raw_subset, design, block = as.character(til_clusters_subset$patient_id))
fit1 <- lmFit(array_expression_raw_subset, design, block = as.character(til_clusters_subset$patient_id),
correlation = corfit$consensus.correlation)
} else {
fit1 <- lmFit(array_expression_raw_subset, design)
}
fit1_contrasts <- limma::contrasts.fit(fit1, contrast.matrix)
fit1_contrasts <- limma::eBayes(fit1_contrasts)
## Genes differential in hot-vs-cold (contrast 1), BH-adjusted, by p-value.
hot_genes <- limma::topTable(fit1_contrasts, coef = "(S_TIL + ES_TIL)/2 - N_TIL",
adjust.method = "BH", number = Inf, sort = "p")
## Genes differential between ES-TIL and S-TIL (contrast 2).
nonstromal_genes <- limma::topTable(fit1_contrasts, coef = "ES_TIL - S_TIL",
adjust.method = "BH", number = Inf, sort = "p")
## KEGG pathway enrichment on each contrast's gene list.
hot_pathways <- ithi.expr::pathway_analysis(hot_genes, gs_type = "kegg", eg_header = "EntrezGene")
hot_pathways
As expected, antigen presentation and cytotoxicity, etc. pop up pretty high on this list.
nonstromal_pathways <- ithi.expr::pathway_analysis(nonstromal_genes, gs_type = "kegg",
eg_header = "EntrezGene")
nonstromal_pathways
And here, we see ECM-associated pathways (focal adhesion and ECM-receptor interaction) pop up as downregulated in ES-TIL relative to S-TIL, consistent with the enrichment of C1 molecular subtype tumours in S-TIL.
Next, we’d like to design a classifier based on the array expression data.
Do note that we are extremely restricted in our ability to train/test on this data due to the size of the dataset.
Since we’re dealing with a dataset with lots of features – that are most definitely correlated with one another – and a small dataset, we’re better off looking at methods like LDA. But first, we’ll want to reduce our feature space to a number of manageable features.
The first thing we’ll do is select genes that demonstrate variability across our dataset. We’ll start by picking 1000 genes with the highest MAD values.
## Rank genes by median absolute deviation across samples and keep the
## 1000 most variable as candidate features.
n_top_genes <- 1000
mad_values <- apply(array_expression_raw_subset, 1, mad)
top_genes <- names(sort(mad_values, decreasing = TRUE))[1:n_top_genes]
Now we standardize and apply PCA.
## PCA on the standardized top-MAD genes; t() makes samples the rows.
expr_pca <- prcomp(t(array_expression_raw_subset[top_genes, ]), center = TRUE,
scale. = TRUE)
And finally, taking the top 30 PCs and using LDA:
## Project samples onto the top 30 PCs and rebuild X/y/X_lab for the
## array-based classifiers, mirroring the Nanostring setup.
num_pcs <- 30
X <- expr_pca$x[, 1:num_pcs]
y <- til_clusters
shared_samples <- intersect(rownames(X), y$condensed_id)
X <- X[shared_samples, ]
y <- (y %>% column_to_rownames(var = "condensed_id"))[shared_samples, , drop = FALSE]
X_lab <- X %>% as.data.frame %>% rownames_to_column(var = "condensed_id")
X_lab$patient_id <- ithi.meta::map_id(X_lab$condensed_id, from = "condensed_id",
to = "patient_id", db_path)
X_lab$subtype <- y$cluster
## 10 rounds of patient-holdout CV with classical LDA on the PCs.
array_lda_results <- lapply(1:10, function(i) run_tilclassifier_cv(X, y, X_lab,
type = "lda")) %>% rbind.fill
with(array_lda_results, mclust::adjustedRandIndex(prediction, actual))
[1] 0.04203888
entropy::mi.empirical(with(array_lda_results, table(prediction, actual)))
[1] 0.04790647
## Confusion matrix (0 = N-TIL, 1 = S-TIL, 2 = ES-TIL).
with(array_lda_results, table(prediction, actual))
actual
prediction 0 1 2
0 10 8 10
1 7 7 24
2 16 5 13
We’re not doing so well here.
## Same CV on the PCs with diagonal LDA.
array_dlda_results <- lapply(1:10, function(i) run_tilclassifier_cv(X, y, X_lab,
type = "dlda")) %>% rbind.fill
with(array_dlda_results, mclust::adjustedRandIndex(prediction, actual))
[1] 0.04161488
entropy::mi.empirical(with(array_dlda_results, table(prediction, actual)))
[1] 0.05539707
## Confusion matrix (0 = N-TIL, 1 = S-TIL, 2 = ES-TIL).
with(array_dlda_results, table(prediction, actual))
actual
prediction 0 1 2
0 23 12 11
1 1 3 5
2 18 6 23
Again, disappointing performance. But perhaps not surprising given how small this dataset is.
An alternative to explicitly selecting features with PCA is to regularize. Others (Huang, Cabral, and de la Torre 2015) have shown that robust regularized LDA outperforms PCA-LDA in some scenarios.
We can try this too with the rrlda package in R.
Unfortunately we can’t use all 1000 most variable genes unless we want to be waiting half an afternoon, because this method isn’t fast.
## rrlda works on the raw genes (no PCA), so restrict to the top 200
## most variable genes for tractable runtime.
num_top_subset <- 200
X <- t(array_expression_raw_subset[top_genes[1:num_top_subset], ])
y <- til_clusters
shared_samples <- intersect(rownames(X), y$condensed_id)
X <- X[shared_samples, ]
y <- (y %>% column_to_rownames(var = "condensed_id"))[shared_samples, , drop = FALSE]
X_lab <- X %>% as.data.frame %>% rownames_to_column(var = "condensed_id")
X_lab$patient_id <- ithi.meta::map_id(X_lab$condensed_id, from = "condensed_id",
to = "patient_id", db_path)
X_lab$subtype <- y$cluster
## 10 rounds of patient-holdout CV with robust regularized LDA.
array_rrlda_results <- lapply(1:10, function(i) run_tilclassifier_cv(X, y, X_lab,
type = "rrlda")) %>% rbind.fill
with(array_rrlda_results, mclust::adjustedRandIndex(prediction, actual))
[1] 0.03365307
entropy::mi.empirical(with(array_rrlda_results, table(prediction, actual)))
[1] 0.03669907
## Confusion matrix (0 = N-TIL, 1 = S-TIL, 2 = ES-TIL).
with(array_rrlda_results, table(prediction, actual))
actual
prediction 0 1 2
0 29 7 14
1 3 5 3
2 27 8 27
As this shows, nothing seems to look good for the array expression data. But this is not necessarily surprising, given the lack of samples we have to work with in designing a classifier.
It’s possible that we may be able to design a classifier off of H&E data (epithelial and stromal lymphocyte counts + cancer cell-lymphocyte colocalization). To do so, we’re going to need to show:
But that’s an analysis that’ll be described in another file, after Finn’s pipeline completes on TCGA.