## Inputs and parameters injected by Snakemake.
ihc_table_path <- snakemake@input$ihc_table
xcr_table_path <- snakemake@input$xcr_table
molecular_subtype_file <- snakemake@input$molsubtypes
graph_rds_path <- snakemake@input$xcr_graphs
db_path <- snakemake@params$db
tils_for_cluster <- unlist(snakemake@params$tils_for_cluster)
all_tiltypes <- unlist(snakemake@params$all_tiltypes)
## Project utility package; load_base_libs() attaches the common base libraries
## (presumably data.table/dplyr/ggplot2 -- fread() and %>% are used below
## without an explicit library() call; TODO confirm in ithi.utils).
library(ithi.utils)
load_base_libs()
library(igraph)
library(tidygraph)
library(ggraph)
library(entropy)    # entropy() on community/clique size distributions
library(vegan)      # diversity() for inverse-Simpson indices
library(stepPlr)    # backend for caret's "plr" (penalized logistic) method
library(nnet)       # multinom() for multinomial regression
library(caret)      # train()/trainControl() for cross-validated fitting
library(ithi.meta)
library(ithi.xcr)
## Shared colour palette for patient annotations in figures.
annotation_colours <- ithi.figures::get_annotation_colours()
## IHC TIL-density table and the TCR/BCR clonotype table (deduplicated).
ihc_table <- fread(ihc_table_path)
xcr_table <- read_clonotypes(xcr_table_path, duplicates = FALSE, db_path)
Read 0.0% of 304822 rows
Read 32.8% of 304822 rows
Read 72.2% of 304822 rows
Read 304822 rows and 18 (of 18) columns from 0.070 GB file in 00:00:06
molsubtypes <- fread(molecular_subtype_file)
## Cluster samples into 3 TIL clusters using the selected TIL density columns.
til_clusters <- ithi.figures:::get_til_clusters(ihc_table, molsubtypes, tils_for_cluster = tils_for_cluster,
nclusts = 3)
## Map sample-level condensed IDs to patient IDs via the project database.
til_clusters$patient_id <- ithi.meta::map_id(til_clusters$condensed_id, from = "condensed_id",
to = "patient_id", db_path)
The structure of these graphs is as follows:
## Refactor this if it works Probably best to plot top X per sample -- works
## more effectively
## Build a tbl_graph whose nodes are clonotypes shared between >= 2 samples of
## a patient, and whose edges connect clonotypes that co-occur in the same
## pair of samples (edge attribute "pair" names that sample pair). Only the
## top_n most prevalent shared clonotypes (by total read frequency) are kept.
##
## Args:
##   xcr_clones: data.frame of clonotypes with columns id, condensed_id, count.
##   nodes: unused legacy argument, kept for backward compatibility.
##   top_n: maximum number of shared clonotypes to retain (default: all).
## Returns: a tidygraph tbl_graph.
## NOTE(review): if a patient has no shared clonotypes the node table is
## empty and tbl_graph() will likely fail -- confirm callers guarantee >= 1.
construct_xcr_graph <- function(xcr_clones, nodes = "xcrclone", top_n = Inf) {
    ## Per-clonotype summary: containing samples, total count, total frequency.
    sample_membership <- xcr_clones %>% group_by(id) %>%
        do(samples = unique(as.character(.$condensed_id)),
           total_count = sum(.$count),
           total_freq = sum(.$count)/sum(xcr_clones$count))
    sample_membership$total_count <- unlist(sample_membership$total_count)
    sample_membership$total_freq <- unlist(sample_membership$total_freq)
    sample_membership$numeric_id <- paste0("id_", factor(sample_membership$id) %>%
        as.numeric)
    ## Keep only clonotypes observed in more than one sample.
    sample_membership_shared <- sample_membership[sapply(sample_membership$samples,
        length) > 1, ]
    ## BUGFIX: cap top_n against the number of *shared* clonotypes (capping
    ## against nrow(sample_membership) before filtering produced NA-filled
    ## rows), sort DECREASING so the most prevalent clonotypes are kept (the
    ## stated intent), and use seq_len() so top_n == 0 is handled safely.
    top_n <- min(top_n, nrow(sample_membership_shared))
    sample_membership_shared <- sample_membership_shared[
        order(sample_membership_shared$total_freq, decreasing = TRUE), ][seq_len(top_n), ]
    ## Stable integer index per clonotype, used as the node ID for edges.
    sample_membership_shared$index <- as.numeric(factor(sample_membership_shared$numeric_id,
        levels = sort(unique(sample_membership_shared$numeric_id))))
    nodes <- sample_membership_shared %>%
        subset(select = c("index", "numeric_id", "id")) %>%
        plyr::rename(c(id = "clonotype"))
    unique_samples <- xcr_clones$condensed_id %>% unique %>% as.character
    sample_pairs <- combn(unique_samples, 2)
    ## For each pair of samples, connect every pair of clonotypes shared by
    ## both samples; label the edges with the (sorted) sample pair.
    edges <- lapply(seq_len(ncol(sample_pairs)), function(i) {
        pair <- sample_pairs[, i]
        contained_nodes <- sample_membership_shared[sapply(sample_membership_shared$samples,
            function(x) all(pair %in% x)), ]
        if (length(contained_nodes$index) > 1) {
            pair_edges <- data.frame(combn(contained_nodes$index, 2) %>% t,
                stringsAsFactors = FALSE) %>% plyr::rename(c(X1 = "from", X2 = "to"))
            return(data.frame(pair_edges, pair = paste(sort(pair), collapse = ","),
                stringsAsFactors = FALSE))
        } else {
            return(data.frame())
        }
    }) %>% rbind.fill
    ## Direction is meaningless here; downstream converts to undirected anyway.
    g <- tbl_graph(nodes = nodes, edges = edges, directed = TRUE)
    return(g)
}
## Compute a panel of summary measures on a clonotype-sharing graph.
##
## Args:
##   g: a tbl_graph (or igraph) clonotype-sharing graph.
## Returns: a named list of scalar measures covering size, centralization
##   (with theoretical maxima for later normalization), Louvain community
##   structure, transitivity, assortativity, density, cliques, and the
##   proportion of multi-edges.
calculate_graph_measures <- function(g) {
    ## All measures are defined on the undirected version of the graph.
    ## Modern igraph API (as_undirected, max_cliques, count_multiple, ...)
    ## replaces the deprecated dot-style names used previously.
    ig <- as_undirected(as.igraph(g))
    louvain_cluster <- cluster_louvain(ig)
    #spinglass_cluster <- cluster_spinglass(ig) ## doesn't work on unconnected graphs
    maximal_cliques <- max_cliques(ig)
    clique_sizes <- lengths(maximal_cliques)
    cluster_sizes <- table(louvain_cluster$membership)
    edge_multiplicity <- count_multiple(ig)  # hoisted: previously computed twice
    graph_attributes <- list(
        ## Basic properties, i.e. # edges, # vertices
        num_vertices = length(V(ig)),
        num_edges = length(E(ig)),
        num_simple_edges = length(E(simplify(ig))),
        ## Centralization measures
        centr_degree = centr_degree(ig)$centralization,
        centr_degree_max = centr_degree_tmax(ig),
        centr_eigen = centr_eigen(ig)$centralization,
        centr_eigen_max = centr_eigen_tmax(ig),
        centr_clo = centr_clo(ig)$centralization,
        centr_clo_tmax = centr_clo_tmax(ig),
        centr_betw = centr_betw(ig, directed = FALSE)$centralization,
        centr_betw_tmax = centr_betw_tmax(ig, directed = FALSE),
        ## Community measures (NOTE: "simpson" means INVERSE Simpson index)
        cluster_louvain_n = length(louvain_cluster),
        #cluster_spinglass_n=length(spinglass_cluster),
        cluster_louvain_size = unname(max(cluster_sizes)),
        #cluster_spinglass_size=unname(max(table(spinglass_cluster$membership))),
        cluster_louvain_entropy = entropy::entropy(cluster_sizes),
        cluster_louvain_simpson = vegan::diversity(cluster_sizes, index = "invsimpson"),
        ## Global transitivity
        transitivity_undirected = transitivity(ig, type = "globalundirected"),
        ## Assortativity
        assortativity_degree = assortativity_degree(ig),
        ## Density
        density = edge_density(ig),
        ## Cliques
        maximal_cliques_n = count_max_cliques(ig),
        maximal_clique_size = clique_num(ig),
        maximal_clique_entropy = entropy::entropy(clique_sizes),
        maximal_clique_simpson = vegan::diversity(clique_sizes, index = "invsimpson"),
        ## Automorphisms
        #automorphisms_n=as.numeric(automorphisms(ig)$group_size),
        ## Proportion of edge instances that belong to a multi-edge
        multiple_prop = mean(edge_multiplicity > 1)
    )
    return(graph_attributes)
}
## Plot a clonotype-sharing graph with the Kamada-Kawai layout; edges are
## coloured by the sample pair they connect.
## NOTE(review): scale_colour_continuous() targets the *node* colour scale,
## but no node colour aesthetic is mapped and the edge colour ("pair") is
## discrete -- this scale likely has no effect; confirm and consider removing.
plot_xcr_graph <- function(g) {
g %>% ggraph(layout = 'kk') + geom_edge_link(aes(colour=pair)) + geom_node_point() + scale_colour_continuous(guide = 'legend') + theme_graph()
}
## For every segment type (e.g. TRB) x patient combination, build a
## subsampled (top-50) and a full clonotype-sharing graph, and compute the
## graph measures on the full graph. Result: nested list seg -> patient.
segments <- unique(xcr_table$type)
patients <- sort(unique(xcr_table$patient_id))
seg_results <- lapply(segments, function(segtype) {
print(segtype)  # progress logging; this loop is slow (~30 min)
pat_results <- lapply(patients, function(pat) {
print(pat)
xcr_clones <- subset(xcr_table, patient_id == pat & type == segtype)
g_small <- construct_xcr_graph(xcr_clones, nodes = "xcrclone", top_n = 50)
g_full <- construct_xcr_graph(xcr_clones, nodes = "xcrclone")
attributes <- calculate_graph_measures(g_full)
return(list(g_small = g_small, g_full = g_full, attributes = attributes))
})
names(pat_results) <- patients
return(pat_results)
})
names(seg_results) <- segments
## NOTE(review): hard-coded absolute output path; should come from
## snakemake@output so the workflow stays portable.
saveRDS(seg_results, file = "~/shahlab/projects/ITH_Immune/paper/review/data_objects/xcr_graph_results.rds")
We don’t have time to wait for that to finish (it takes 30 minutes or so), so:
seg_results <- readRDS(graph_rds_path)
This is a graph subsampled from patient 2’s TCRs. These are the 50 most prevalent (according to read count) shared clonotypes.
plot_xcr_graph(seg_results$TRB$`2`$g_small)
## Flatten seg_results into one table: one row per (segment type, patient),
## with each graph measure as a column.
graph_attribute_table <- lapply(names(seg_results), function(seg) {
x <- seg_results[[seg]]
patient_attribute_table <- lapply(names(x), function(pat) {
y <- x[[pat]]
data.frame(patient_id = pat, as.data.frame(y$attributes), stringsAsFactors = FALSE)
}) %>% rbind.fill
return(data.frame(type = seg, patient_attribute_table, stringsAsFactors = FALSE))
}) %>% rbind.fill
graph_attribute_table$patient_id <- ithi.meta::factor_id(graph_attribute_table$patient_id,
type = "patient_id", db_path)
## Normalize centralization scores by their theoretical maxima so they are
## comparable across graphs of different sizes.
graph_attribute_table$centr_degree_normalized <- with(graph_attribute_table,
centr_degree/centr_degree_max)
graph_attribute_table$centr_eigen_normalized <- with(graph_attribute_table,
centr_eigen/centr_eigen_max)
graph_attribute_table$centr_betw_normalized <- with(graph_attribute_table, centr_betw/centr_betw_tmax)
graph_attribute_table$centr_clo_normalized <- with(graph_attribute_table, centr_clo/centr_clo_tmax)
## NOTE that simpson = inverse_simpson here. Should add entropy and simpson
## for cluster_louvain.
## Melt graph measures to long format, pair them with per-patient mean TIL
## densities (restricted to samples with XCR data), and test Spearman
## correlations between every (TIL type, segment type, measure) combination.
id_vars <- c("type", "patient_id")
measure_vars <- setdiff(colnames(graph_attribute_table), id_vars)
graph_attribute_table_melted <- reshape2::melt(graph_attribute_table, id.vars = id_vars,
measure.vars = measure_vars, variable.name = "measure", value.name = "value")
ihc_table_melted <- reshape2::melt(ihc_table, id.vars = c("condensed_id", "patient_id"),
measure.vars = all_tiltypes, variable.name = "tiltype", value.name = "density")
## Only keep samples that also have XCR data, then average density per patient.
xcr_samples <- xcr_table$condensed_id %>% unique
ihc_table_melted <- subset(ihc_table_melted, condensed_id %in% xcr_samples)
ihc_table_means <- ihc_table_melted %>% group_by(patient_id, tiltype) %>% summarise(density = mean(density,
na.rm = TRUE))
ihc_table_means$patient_id <- ithi.meta::factor_id(ihc_table_means$patient_id,
type = "patient_id", db_path)
ihc_graph <- plyr::join(ihc_table_means %>% as.data.frame, graph_attribute_table_melted %>%
as.data.frame, by = "patient_id")
## Spearman correlation p-values per (tiltype, type, measure) facet.
pvals <- ithi.utils::compute_pvals_subsets(ihc_graph, facet_vars = c("tiltype",
"type", "measure"), formula = ~density + value, corfun = cor.test, method = "spearman")
## Display the significant correlations (BH-adjusted, presumably -- confirm
## in ithi.utils::compute_pvals_subsets).
subset(pvals, p.adj < 0.05)
## Example scatter: CD8 density vs largest Louvain community size (TRB).
ggplot(ihc_graph %>% subset(type == "TRB" & tiltype == "E_CD8_density" & measure ==
"cluster_louvain_size"), aes(x = density, y = value)) + geom_point(aes(colour = factor(patient_id))) +
scale_color_manual(values = annotation_colours$patient_id) + theme_bw() +
theme_Publication() + theme_nature()
It seems like several of these graph measures correlate very well with each TIL type’s density.
with(subset(pvals, p.adj < 0.05), table(measure))
measure
num_vertices num_edges num_simple_edges
10 12 12
centr_degree centr_degree_max centr_eigen
8 10 0
centr_eigen_max centr_clo centr_clo_tmax
10 3 10
centr_betw centr_betw_tmax cluster_louvain_n
0 10 10
cluster_louvain_size cluster_louvain_entropy cluster_louvain_simpson
12 6 7
transitivity_undirected assortativity_degree density
1 1 6
maximal_cliques_n maximal_clique_size maximal_clique_entropy
10 11 0
maximal_clique_simpson multiple_prop centr_degree_normalized
0 0 6
centr_eigen_normalized centr_betw_normalized centr_clo_normalized
10 3 10
And these seem to be the same measures. Simple measures like the number of vertices (total # unique clonotypes for a patient) and the number of edges (number of pairwise ‘sharings’ between samples for the same clonotype – this is biased by # of samples per patient) do very well.
In terms of what seems to be important:
I think it’s safe to say that ‘simple’ measures like the number of vertices/edges are better predictors of TIL density. That also explains why TCR/BCR repertoire similarity (which is effectively related to the number of edges) correlates so well with TIL density.
We might be interested in predicting whether or not a patient is ES_pure, ES_mixed, or ES_none – since we think these potentially have prognostic implications.
## Rescale each column of a numeric matrix/data.frame to the [0, 1] range.
##
## Args:
##   tab: numeric matrix or data.frame of numeric columns.
##   ...: forwarded to min()/max() (e.g. na.rm = TRUE).
## Returns: a matrix with each column linearly mapped to [0, 1].
## Constant columns (max == min) are mapped to 0 instead of NaN (0/0),
## so downstream model fitting never receives non-finite features.
range01 <- function(tab, ...) {
    apply(tab, 2, function(x) {
        lo <- min(x, ...)
        span <- max(x, ...) - lo
        if (span == 0) {
            ## Degenerate column: every value equals the minimum.
            rep(0, length(x))
        } else {
            (x - lo)/span
        }
    })
}
## Restrict TIL clusters to samples with XCR data and drop recurrence
## samples, then derive a per-patient ES-TIL subtype label.
til_clusters_subset <- til_clusters %>% subset(condensed_id %in% xcr_samples)
recurrence_samples <- c("7_BrnM", "7_RPvM", "11_Pv1", "11_Rct1", "11_Rct2",
"23_LOv1")
til_clusters_subset <- subset(til_clusters_subset, !condensed_id %in% recurrence_samples)
## ES_pure: all of a patient's samples are ES-TIL; ES_none: none are;
## ES_mixed: some but not all.
es_subtypes <- til_clusters_subset %>% group_by(patient_id) %>% summarise(ES_pure = all(cluster ==
"ES-TIL"), ES_none = !any(cluster == "ES-TIL"), ES_mixed = (any(cluster ==
"ES-TIL") & !all(cluster == "ES-TIL")))
## Collapse the three mutually-exclusive logical columns (cols 2:4) into a
## single subtype label per patient.
es_subtypes$subtype <- colnames(es_subtypes)[2:4][sapply(1:nrow(es_subtypes),
function(i) which(as.logical(es_subtypes[i, 2:4])))]
## Binary version: ES (pure or mixed) vs notES.
es_subtypes$subtype_simple <- es_subtypes$subtype %>% plyr::mapvalues(c("ES_pure",
"ES_mixed", "ES_none"), c("ES", "ES", "notES"))
## Graph-measure features used as predictors in the subtype models.
feature_predictors <- c("centr_eigen_normalized", "density", "cluster_louvain_size",
"maximal_clique_simpson", "transitivity_undirected", "assortativity_degree")
feature_list <- setdiff(feature_predictors, id_vars)
formula_str <- paste(feature_list, collapse = " + ")
## TRB features joined with subtype labels; features rescaled to [0, 1].
trb_features <- subset(graph_attribute_table, type == "TRB", select = -c(type))
es_subtypes_trb <- plyr::join(trb_features, es_subtypes, by = "patient_id")
es_subtypes_trb <- es_subtypes_trb %>% subset(!is.na(subtype))
es_subtypes_trb[, feature_list] <- range01(es_subtypes_trb[, feature_list])
## Multinomial logistic regression with a small weight decay to curb
## overfitting (see the note below about decay and the ES_mixed class).
es_subtypes_trb_multinom <- nnet::multinom(formula = as.formula(paste0("subtype ~ ",
formula_str)), data = es_subtypes_trb, decay = 0.01)
# weights: 24 (14 variable)
initial value 19.775021
iter 10 value 11.815428
iter 20 value 11.732515
iter 30 value 11.731501
iter 40 value 11.731469
final value 11.731468
converged
es_subtypes_trb_multinom
Call:
nnet::multinom(formula = as.formula(paste0("subtype ~ ", formula_str)),
data = es_subtypes_trb, decay = 0.01)
Coefficients:
(Intercept) centr_eigen_normalized density cluster_louvain_size
ES_none 0.07050054 3.465603 -1.606186 -2.2384131
ES_pure 0.28390962 -2.233819 1.578687 0.5057187
maximal_clique_simpson transitivity_undirected
ES_none 0.1940997 4.032051
ES_pure 1.5492960 -3.632376
assortativity_degree
ES_none -2.473398
ES_pure -1.963789
Residual Deviance: 23.46294
AIC: 51.46294
es_subtypes_trb_multinom$fitted.values %>% apply(1, which.max)
1 2 3 4 6 7 8 10 11 12 13 14 15 16 17 19 20 21
3 2 2 1 3 2 2 2 2 2 3 2 2 2 2 2 3 2
When decay isn’t added, seems to be overfitting. Rarely predicts ES-mixed (class 1) if decay is added.
varImp(es_subtypes_trb_multinom) %>% rownames_to_column(var = "feature")
Transitivity does the best, which is interesting. Also, we tend to see LOWER transitivity in patients with higher TIL.
So let’s do this more rigorously. Using 4-fold cross validation, 5 times, to fit a multinomial model:
message("Training multinomial model ...")
## 4-fold cross-validation repeated 5 times; class probabilities enabled so
## multiClassSummary can report AUC/logLoss alongside Accuracy.
fitControl <- trainControl(method = "repeatedcv", number = 4, repeats = 5, classProbs = TRUE,
summaryFunction = multiClassSummary)
## Tuning grid over the weight-decay penalty (1e-5 .. 10).
nnetGrid <- expand.grid(decay = 10^seq(from = -5, to = 1, by = 1)) ## regularization
## Fit the 3-class (ES_pure/ES_mixed/ES_none) model on the graph features only.
nnetFit <- train(form = as.formula(paste0("subtype ~ ", formula_str)), data = es_subtypes_trb %>%
select(c(feature_list, "subtype")) %>% mutate(subtype = factor(subtype)),
method = "multinom", metric = "Accuracy", maximize = TRUE, trControl = fitControl,
tuneGrid = nnetGrid, verbose = TRUE)
nnetFit
Penalized Multinomial Regression
18 samples
6 predictor
3 classes: 'ES_mixed', 'ES_none', 'ES_pure'
No pre-processing
Resampling: Cross-Validated (4 fold, repeated 5 times)
Summary of sample sizes: 14, 14, 13, 13, 13, 13, ...
Resampling results across tuning parameters:
decay logLoss AUC prAUC Accuracy Kappa
1e-05 9.8113597 0.5222222 0.2480556 0.3950 -9.469697e-05
1e-04 5.7205051 0.5250000 0.2475926 0.3925 -6.721681e-03
1e-03 2.4503777 0.5722222 0.2355093 0.4475 8.145202e-02
1e-02 1.1853415 0.6902778 0.2271759 0.5400 1.769544e-01
1e-01 0.8653243 0.7263889 0.2271296 0.6025 2.302976e-01
1e+00 0.9373209 0.7180556 0.2331944 0.5700 5.833333e-02
1e+01 1.0513313 0.7250000 0.2315741 0.5500 0.000000e+00
Mean_F1 Mean_Sensitivity Mean_Specificity Mean_Pos_Pred_Value
0.6111111 0.3250000 0.6777778 0.4047619
0.6111111 0.3222222 0.6763889 0.4047619
NaN 0.3555556 0.7111111 0.3796296
NaN 0.4250000 0.7347222 0.4000000
NaN 0.4694444 0.7416667 NaN
NaN 0.3666667 0.6833333 NaN
NaN 0.3333333 0.6666667 NaN
Mean_Neg_Pred_Value Mean_Precision Mean_Recall Mean_Detection_Rate
0.6731481 0.4047619 0.3250000 0.1316667
0.6623457 0.4047619 0.3222222 0.1308333
0.7101852 0.3796296 0.3555556 0.1491667
0.7869048 0.4000000 0.4250000 0.1800000
0.8192308 NaN 0.4694444 0.2008333
0.9333333 NaN 0.3666667 0.1900000
NaN NaN 0.3333333 0.1833333
Mean_Balanced_Accuracy
0.5013889
0.4993056
0.5333333
0.5798611
0.6055556
0.5250000
0.5000000
Accuracy was used to select the optimal model using the largest value.
The final value used for the model was decay = 0.1.
1e-05
1e-04
1e-03
1e-02
1e-01
1e+00
1e+01
9.8113597
5.7205051
2.4503777
1.1853415
0.8653243
0.9373209
1.0513313
0.5222222
0.5250000
0.5722222
0.6902778
0.7263889
0.7180556
0.7250000
0.2480556
0.2475926
0.2355093
0.2271759
0.2271296
0.2331944
0.2315741
0.3950
0.3925
0.4475
0.5400
0.6025
0.5700
0.5500
-9.469697e-05
-6.721681e-03
8.145202e-02
1.769544e-01
2.302976e-01
5.833333e-02
0.000000e+00
0.6111111
0.6111111
NaN
NaN
NaN
NaN
NaN
0.3250000
0.3222222
0.3555556
0.4250000
0.4694444
0.3666667
0.3333333
0.6777778
0.6763889
0.7111111
0.7347222
0.7416667
0.6833333
0.6666667
0.4047619
0.4047619
0.3796296
0.4000000
NaN
NaN
NaN
0.6731481
0.6623457
0.7101852
0.7869048
0.8192308
0.9333333
NaN
0.4047619
0.4047619
0.3796296
0.4000000
NaN
NaN
NaN
0.3250000
0.3222222
0.3555556
0.4250000
0.4694444
0.3666667
0.3333333
0.1316667
0.1308333
0.1491667
0.1800000
0.2008333
0.1900000
0.1833333
0.5013889
0.4993056
0.5333333
0.5798611
0.6055556
0.5250000
0.5000000
We’re not doing so well here. Our best model has an accuracy around 0.6. Looking at the results, it consistently has problems with predicting the ES-mixed cluster, indicating that it can’t really get down to that ‘deep’ of a level. What if we only want to look at 2 clusters – either ES_pure/ES_mixed or ES_none?
message("Training logistic model ...")
## Same repeated CV scheme as above, but with a two-class summary (ROC).
fitControl <- trainControl(method = "repeatedcv", number = 4, repeats = 5, classProbs = TRUE,
summaryFunction = twoClassSummary)
## Grid over the L2 penalty (lambda) and the model-selection criterion (cp).
plrGrid <- expand.grid(lambda = 10^seq(from = -5, to = 5, by = 1), cp = c("aic",
"bic")) ## regularization
## Binary model: ES (pure or mixed) vs notES, selected by ROC AUC.
## NOTE(review): the best-by-ROC models (lambda >= 10) have Sens = 0 --
## they predict every patient as notES; ROC alone is misleading here.
plrFit <- train(form = as.formula(paste0("subtype_simple ~ ", formula_str)),
data = es_subtypes_trb %>% select(c(feature_list, "subtype_simple")) %>%
mutate(subtype_simple = factor(subtype_simple)), method = "plr", metric = "ROC",
maximize = TRUE, trControl = fitControl, tuneGrid = plrGrid)
plrFit
Penalized Logistic Regression
18 samples
6 predictor
2 classes: 'ES', 'notES'
No pre-processing
Resampling: Cross-Validated (4 fold, repeated 5 times)
Summary of sample sizes: 14, 13, 13, 14, 13, 14, ...
Resampling results across tuning parameters:
lambda cp ROC Sens Spec
1e-05 aic 0.5291667 0.600 0.5250000
1e-05 bic 0.5291667 0.600 0.5250000
1e-04 aic 0.5916667 0.600 0.5166667
1e-04 bic 0.5916667 0.600 0.5166667
1e-03 aic 0.6875000 0.650 0.6083333
1e-03 bic 0.6875000 0.650 0.6083333
1e-02 aic 0.7666667 0.550 0.6666667
1e-02 bic 0.7666667 0.550 0.6666667
1e-01 aic 0.8291667 0.475 0.8083333
1e-01 bic 0.8291667 0.475 0.8083333
1e+00 aic 0.8750000 0.300 0.9083333
1e+00 bic 0.8750000 0.300 0.9083333
1e+01 aic 0.8750000 0.000 1.0000000
1e+01 bic 0.8750000 0.000 1.0000000
1e+02 aic 0.8750000 0.000 1.0000000
1e+02 bic 0.8750000 0.000 1.0000000
1e+03 aic 0.8750000 0.000 1.0000000
1e+03 bic 0.8750000 0.000 1.0000000
1e+04 aic 0.8750000 0.000 1.0000000
1e+04 bic 0.8750000 0.000 1.0000000
1e+05 aic 0.8750000 0.000 1.0000000
1e+05 bic 0.8750000 0.000 1.0000000
ROC was used to select the optimal model using the largest value.
The final values used for the model were lambda = 1e+05 and cp = aic.
1e-05
1e-05
1e-04
1e-04
1e-03
1e-03
1e-02
1e-02
1e-01
1e-01
1e+00
1e+00
1e+01
1e+01
1e+02
1e+02
1e+03
1e+03
1e+04
1e+04
1e+05
1e+05
aic
bic
aic
bic
aic
bic
aic
bic
aic
bic
aic
bic
aic
bic
aic
bic
aic
bic
aic
bic
aic
bic
0.5291667
0.5291667
0.5916667
0.5916667
0.6875000
0.6875000
0.7666667
0.7666667
0.8291667
0.8291667
0.8750000
0.8750000
0.8750000
0.8750000
0.8750000
0.8750000
0.8750000
0.8750000
0.8750000
0.8750000
0.8750000
0.8750000
0.600
0.600
0.600
0.600
0.650
0.650
0.550
0.550
0.475
0.475
0.300
0.300
0.000
0.000
0.000
0.000
0.000
0.000
0.000
0.000
0.000
0.000
0.5250000
0.5250000
0.5166667
0.5166667
0.6083333
0.6083333
0.6666667
0.6666667
0.8083333
0.8083333
0.9083333
0.9083333
1.0000000
1.0000000
1.0000000
1.0000000
1.0000000
1.0000000
1.0000000
1.0000000
1.0000000
1.0000000
Yeah, this does pretty badly at predicting TIL subtype too – and that’s just with binary classes.
So the graph measures we’re using, i.e.
feature_list
[1] "centr_eigen_normalized" "density"
[3] "cluster_louvain_size" "maximal_clique_simpson"
[5] "transitivity_undirected" "assortativity_degree"
centr_eigen_normalized
density
cluster_louvain_size
maximal_clique_simpson
transitivity_undirected
assortativity_degree
aren’t very good predictors of TIL subtype.