library(ithi.utils)
load_base_libs()
library(ithi.meta)
library(ithi.figures)
library(ithi.expr)
library(ithi.external)
library(ithi.lohhla)
lohhla_icgc_outdir <- snakemake@input$lohhla_icgc_outdir
ith_icgc_bc_file <- snakemake@input$ith_icgc_bc
icgc_specimen_file <- snakemake@input$icgc_specimen
nanostring_annotations_path <- snakemake@input$nanostring_annotations
icgc_subtype_file <- snakemake@input$icgc_subtypes
icgc_expr_mat_file <- snakemake@input$icgc_expr_mat
lohhla_tcga_outdir <- snakemake@input$lohhla_tcga_outdir
tcga_ov_annotation_file <- snakemake@input$tcga_ov_annotations
nmf_subtype_file <- snakemake@input$array_nmf_subtypes
tcga_paper_subtype_file <- snakemake@input$tcga_paper_subtypes_raw
tcga_paper_silhouette_file <- snakemake@input$tcga_paper_subtypes_possilhouette
tcga_ov_bam_dir <- snakemake@input$tcga_ov_bam_dir
tcga_nonstd_expr_mat_file <- snakemake@input$tcga_nonstd_expr
db_path <- snakemake@params$db
supportive_site_threshold <- as.numeric(snakemake@params$lohhla_supportive_site_threshold)
mismatch_site_threshold <- as.numeric(snakemake@params$lohhla_mismatch_site_threshold)
icgc_specimen <- fread(icgc_specimen_file)
ith_icgc_bc <- fread(ith_icgc_bc_file)
icgc_subtypes <- fread(icgc_subtype_file)
icgc_expr_mat <- fread(icgc_expr_mat_file)
nmf_subtypes <- read_tcga_nmf_subtypes(nmf_subtype_file)
tcga_paper_subtypes <- read_tcga_paper_subtypes(tcga_paper_subtype_file, tcga_paper_silhouette_file)
tcga_nonstd_expr_mat <- fread(tcga_nonstd_expr_mat_file)
Read 0.0% of 570 rows
Read 570 rows and 12439 (of 12439) columns from 0.112 GB file in 00:00:06
tcga_ov_annotations <- fread(tcga_ov_annotation_file)
nanostring_labels <- fread(nanostring_annotations_path)
nanostring_labels_modified <- read_nanostring_labels_modified(nanostring_annotations_path)
tcga_bam_manifest <- create_bam_manifest(tcga_ov_bam_dir, ucec_format = FALSE,
local = FALSE)
Here we’re going to run LOHHLA on ICGC and TCGA.
A few things to note:
icgc_hlaloss_table <- extract_lohhla_table(lohhla_icgc_outdir)
icgc_loss_table <- construct_loss_table(icgc_hlaloss_table, type = "ICGC", supportive_site_threshold = supportive_site_threshold,
mismatch_site_threshold = mismatch_site_threshold)
icgc_loss_summarized <- summarize_loss_table(icgc_loss_table, type = "ICGC")
table(icgc_loss_summarized$loh)
FALSE TRUE
36 17
icgc_loss_summarized_subtypes <- merge(icgc_loss_summarized, icgc_subtypes,
by.x = c("sample_key"), by.y = c("icgc_donor_id"))
with(icgc_loss_summarized_subtypes, table(loh, subtype))
subtype
loh C1 C2 C4 C5
FALSE 14 10 8 3
TRUE 9 3 2 2
with(icgc_loss_summarized_subtypes, table(loh, nmf_subtype))
nmf_subtype
loh C1 C2 C4 C5
FALSE 12 6 6 12
TRUE 9 3 0 4
fisher.test(with(icgc_loss_summarized_subtypes, table(loh, nmf_subtype %in%
c("C1", "C2"))))
Fisher's Exact Test for Count Data
data:
p-value = 0.1311
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
0.7111636 14.9334176
sample estimates:
odds ratio
2.938272
subset(icgc_subtypes, icgc_donor_id %in% c("DO46388", "DO46390", "DO46571"))
The other 4 samples have no suitable HLA alleles (i.e. they’re homozygous at all 3 HLA loci). These 3 samples had the error that is due to no reads mapping to one of the alleles. If these samples are true ‘LOH’, their molecular subtype assignments are not surprising in light of the previous results – 2/3 are C1 samples, by either classification.
icgc_pathway_matrix <- ithi.expr::create_pathway_matrix(ith_icgc_bc, nanostring_labels,
db_path, convert_ids = FALSE, summary_method = "arithmetic_mean")
icgc_pathway_matrix_summarized <- summarize_expression_by_patient(icgc_pathway_matrix,
icgc_specimen)
icgc_pathway_df <- icgc_pathway_matrix_summarized %>% tibble::column_to_rownames(var = "Name") %>%
as.matrix %>% melt %>% plyr::rename(c(Var1 = "pathway", Var2 = "sample_key",
value = "mean_expr"))
icgc_loss_summarized_expr <- merge(icgc_loss_summarized, icgc_pathway_df)
df <- icgc_loss_summarized_expr # %>% subset(pathway %in% c('T-Cell Functions', 'B-Cell Functions', 'Cytotoxicity'))
pvals <- compute_pvals_subsets(df, facet_vars = c("pathway"), corfun = wilcox.test,
formula = mean_expr ~ loh)
ggplot(df, aes(x = loh, y = mean_expr)) + geom_boxplot(outlier.size = -1) +
geom_jitter(position = position_jitter(width = 0.2, height = 0), alpha = 0.5) +
theme_bw() + facet_wrap(~pathway, scales = "free", ncol = 3) + theme_Publication() +
theme_nature() + geom_text(data = pvals, aes(x = Inf, y = Inf, label = p.value.text),
hjust = 1.1, vjust = 1.5, size = 2.5, parse = TRUE)
We can see a trend, but it’s not significant – likely due to sample size (see first section for sample size numbers).
So we see a tendency towards higher T-cell-associated gene expression among samples with LOHHLA, though this is not significant.
At the moment I’m only using those samples for which we have ploidy/cellularity predictions from ASCAT from Nicolai Birkbak – that was from an older version of ASCAT, etc.
I still have to run the newer ASCAT pipeline – ran into a few problems with time earlier that I’ll probably have to resolve by making the ASCAT package more efficient. Got halfway through this then got sidetracked by other analysis …
So these results are really on about 2/3 to 3/4 of the TCGA-OV cohort.
tcga_hlaloss_table <- extract_lohhla_table(lohhla_tcga_outdir)
tcga_hlaloss_table$tcga_sample_id <- stringr::str_extract(tcga_hlaloss_table$region,
"TCGA\\-[0-9A-Z]{2}\\-[0-9A-Z]{4}")
tcga_hlaloss_table$analyte_type <- stringr::str_extract(tcga_hlaloss_table$region,
"(?<=TCGA\\-[0-9A-Z]{2}\\-[0-9A-Z]{4}\\-[0-9A-Z]{3}\\-[0-9A-Z]{2})[A-Z]")
# tcga_hlaloss_table <- subset(tcga_hlaloss_table, analyte_type == 'D')
hist(tcga_hlaloss_table$HLA_type1copyNum_withBAF, breaks = 30)
hist(tcga_hlaloss_table$HLA_type2copyNum_withBAF, breaks = 30)
tcga_loss_table <- construct_loss_table(tcga_hlaloss_table, type = "TCGA", supportive_site_threshold = supportive_site_threshold,
mismatch_site_threshold = mismatch_site_threshold)
max_plate <- tcga_loss_table %>% group_by(tcga_sample_id) %>% summarise(max_plate = max(as.numeric(plate)))
tcga_loss_table <- tcga_loss_table %>% plyr::join(max_plate)
tcga_loss_table <- subset(tcga_loss_table, as.numeric(plate) == max_plate)
tcga_loss_summarized <- summarize_loss_table(tcga_loss_table, type = "TCGA")
tcga_manifest_data <- tcga_bam_manifest %>% subset(specimen_type == "diseased",
select = c(center, patient, barcode, sample_key, short_barcode))
nmf_subtypes_merged <- Reduce(function(x, y) plyr::join(x, y, type = "full"),
list(nmf_subtypes, tcga_manifest_data, tcga_paper_subtypes)) %>% plyr::rename(c(subtype = "nmf_subtype",
secondary = "nmf_secondary_subtype")) %>% subset(select = -c(tcga_sample_id))
# nmf_subtypes_merged_noduplicates <-
# nmf_subtypes_merged[!duplicated(nmf_subtypes_merged$short_barcode),]
nmf_subtypes_merged$tcga_sample_id <- nmf_subtypes_merged$patient
nmf_subtypes_merged_noduplicates <- nmf_subtypes_merged
nmf_subtypes_merged_noduplicates <- subset(nmf_subtypes_merged_noduplicates,
!is.na(sample_key)) %>% subset(select = -c(barcode, sample_key, center)) %>%
unique
nmf_subtypes_merged_noduplicates <- nmf_subtypes_merged_noduplicates[!duplicated(nmf_subtypes_merged_noduplicates$patient),
]
tcga_loss_summarized$analysis_center <- str_extract(tcga_loss_summarized$sample_key,
"(Non\\-)?Broad")
tcga_loss_annotated <- Reduce(function(x, y) plyr::join(x, y, type = "full"),
list(tcga_loss_summarized %>% as.data.frame, tcga_ov_annotations, nmf_subtypes_merged_noduplicates))
with(tcga_loss_annotated, table(MolecularSubtype, loss))
loss
MolecularSubtype FALSE TRUE
Differentiated 61 0
Immunoreactive 61 3
Mesenchymal 66 0
Proliferative 65 2
with(tcga_loss_annotated, table(MolecularSubtype, loss))/rowSums(with(tcga_loss_annotated,
table(MolecularSubtype, loss)))
loss
MolecularSubtype FALSE TRUE
Differentiated 1.00000000 0.00000000
Immunoreactive 0.95312500 0.04687500
Mesenchymal 1.00000000 0.00000000
Proliferative 0.97014925 0.02985075
fisher.test(with(tcga_loss_annotated, table(MolecularSubtype %in% c("Immunoreactive",
"Mesenchymal"), loss)))
Fisher's Exact Test for Count Data
data:
p-value = 0.3547
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
0.3061054 32.8301221
sample estimates:
odds ratio
2.708565
with(tcga_loss_annotated, table(MolecularSubtype, loh))
loh
MolecularSubtype FALSE TRUE
Differentiated 41 20
Immunoreactive 45 19
Mesenchymal 44 22
Proliferative 45 22
with(tcga_loss_annotated, table(MolecularSubtype, loh))/rowSums(with(tcga_loss_annotated,
table(MolecularSubtype, loh)))
loh
MolecularSubtype FALSE TRUE
Differentiated 0.6721311 0.3278689
Immunoreactive 0.7031250 0.2968750
Mesenchymal 0.6666667 0.3333333
Proliferative 0.6716418 0.3283582
## Not technically a primary site filter here -- this is just a simple Wang
## et al. filter
filter_primary <- FALSE
dat <- tcga_loss_annotated
if (filter_primary) {
dat <- dat %>% subset(!is.na(MolecularSubtype))
}
with(dat, table(nmf_subtype, loss))
loss
nmf_subtype FALSE TRUE
C1 62 0
C2 97 3
C4 101 0
C5 94 2
with(dat, table(nmf_subtype, loss))/rowSums(with(dat, table(nmf_subtype, loss)))
loss
nmf_subtype FALSE TRUE
C1 1.00000000 0.00000000
C2 0.97000000 0.03000000
C4 1.00000000 0.00000000
C5 0.97916667 0.02083333
with(dat, table(nmf_subtype, loh))
loh
nmf_subtype FALSE TRUE
C1 42 20
C2 72 28
C4 73 28
C5 63 33
with(dat, table(nmf_subtype, loh))/rowSums(with(dat, table(nmf_subtype, loh)))
loh
nmf_subtype FALSE TRUE
C1 0.6774194 0.3225806
C2 0.7200000 0.2800000
C4 0.7227723 0.2772277
C5 0.6562500 0.3437500
with(dat %>% subset(sil_width > 0), table(nmf_subtype, loss))
loss
nmf_subtype FALSE TRUE
C1 62 0
C2 88 3
C4 68 0
C5 65 2
with(dat %>% subset(sil_width > 0), table(nmf_subtype, loss))/rowSums(with(dat %>%
subset(sil_width > 0), table(nmf_subtype, loss)))
loss
nmf_subtype FALSE TRUE
C1 1.00000000 0.00000000
C2 0.96703297 0.03296703
C4 1.00000000 0.00000000
C5 0.97014925 0.02985075
with(dat %>% subset(sil_width > 0), table(nmf_subtype, loh))
loh
nmf_subtype FALSE TRUE
C1 42 20
C2 64 27
C4 48 20
C5 45 22
with(dat %>% subset(sil_width > 0), table(nmf_subtype, loh))/rowSums(with(dat %>%
subset(sil_width > 0), table(nmf_subtype, loh)))
loh
nmf_subtype FALSE TRUE
C1 0.6774194 0.3225806
C2 0.7032967 0.2967033
C4 0.7058824 0.2941176
C5 0.6716418 0.3283582
tcga_nonstd_expr_df <- tcga_nonstd_expr_mat %>% as.data.frame %>% tibble::column_to_rownames("tcga_sample_id") %>%
t %>% as.data.frame %>% tibble::rownames_to_column("Name")
tcga_pathway_matrix_amean <- ithi.expr::create_pathway_matrix(tcga_nonstd_expr_df,
nanostring_labels_modified, db_path, convert_ids = FALSE, summary_method = "arithmetic_mean")
tcga_pathway_matrix_gmean <- ithi.expr::create_pathway_matrix(tcga_nonstd_expr_df,
nanostring_labels_modified, db_path, convert_ids = FALSE, summary_method = "geometric_mean")
tcga_pathway_matrix_nano <- tcga_pathway_matrix_amean[setdiff(rownames(tcga_pathway_matrix_amean),
"Rooney_Cytotoxicity"), ]
tcga_pathway_matrix_rooney <- tcga_pathway_matrix_gmean["Rooney_Cytotoxicity",
, drop = FALSE]
tcga_pathway_matrix <- rbind(tcga_pathway_matrix_nano, tcga_pathway_matrix_rooney)
tcga_pathway_df <- tcga_pathway_matrix %>% as.matrix %>% melt %>% plyr::rename(c(Var2 = "short_barcode",
Var1 = "pathway", value = "expr"))
tcga_pathway_df_annotated <- plyr::join(tcga_pathway_df, nmf_subtypes_merged_noduplicates)
p <- ggplot(tcga_pathway_df_annotated, aes(x = nmf_subtype, y = expr)) + geom_boxplot() +
theme_bw() + theme_Publication() + theme_nature() + facet_wrap(~pathway,
scales = "free", ncol = 2)
p
C5 is clearly the lowest, below C4.
tcga_pathway_loss <- tcga_pathway_df_annotated %>% as.data.frame %>% plyr::join(tcga_loss_annotated)
df <- tcga_pathway_loss %>% subset(!is.na(loh))
df$center <- str_extract(df$tcga_sample_id, "(?<=\\-)[0-9A-Z]{2}(?=\\-)")
pvals <- ithi.utils::compute_pvals_subsets(df, facet_vars = c("pathway"), formula = expr ~
loh, corfun = wilcox.test)
p <- ggplot(df, aes(x = loh, y = expr)) + geom_boxplot() + theme_bw() + theme_Publication() +
theme_nature() + facet_wrap(~pathway, scales = "free", ncol = 2) + geom_text(data = pvals,
aes(x = Inf, y = Inf, label = p.value.text), hjust = 1.1, vjust = 1.5, size = 2.5,
parse = TRUE)
p
I also did this with TCGA’s RNA-seq data in another file and found nothing.
df_subset <- subset(df, pathway == "Rooney_Cytotoxicity" & !is.na(loh) & !is.na(center))
pvals <- ithi.utils::compute_pvals_subsets(df_subset, facet_vars = c("center"),
formula = expr ~ loh, corfun = wilcox.test)
p <- ggplot(df_subset, aes(x = loh, y = expr)) + geom_boxplot(outlier.size = -1) +
geom_jitter(position = position_jitter(width = 0.2, height = 0), alpha = 0.5) +
theme_bw() + theme_Publication() + theme_nature() + facet_wrap(~center,
scales = "free", ncol = 2) + geom_text(data = pvals, aes(x = Inf, y = Inf,
label = p.value.text), hjust = 1.1, vjust = 1.5, size = 2.5, parse = TRUE)
table(df_subset$center)
04 09 10 13 20 23 24 25 29 30 31 36 42 57 59 61
26 15 9 75 9 29 60 12 35 6 3 14 7 3 3 47
p
The UPenn center shows something, but the other 2 big ones – Broad and MSKCC – do not.
with(dat %>% subset(paper_pos_silhouette), table(tcga_paper_subtype, loss))
loss
tcga_paper_subtype FALSE TRUE
Differentiated 61 0
Immunoreactive 60 3
Mesenchymal 66 0
Proliferative 65 2
with(dat %>% subset(paper_pos_silhouette), table(tcga_paper_subtype, loss))/rowSums(with(dat %>%
subset(paper_pos_silhouette), table(tcga_paper_subtype, loss)))
loss
tcga_paper_subtype FALSE TRUE
Differentiated 1.00000000 0.00000000
Immunoreactive 0.95238095 0.04761905
Mesenchymal 1.00000000 0.00000000
Proliferative 0.97014925 0.02985075
with(dat %>% subset(paper_pos_silhouette), table(tcga_paper_subtype, loh))
loh
tcga_paper_subtype FALSE TRUE
Differentiated 41 20
Immunoreactive 44 19
Mesenchymal 45 21
Proliferative 45 22
with(dat %>% subset(paper_pos_silhouette), table(tcga_paper_subtype, loh))/rowSums(with(dat %>%
subset(paper_pos_silhouette), table(tcga_paper_subtype, loh)))
loh
tcga_paper_subtype FALSE TRUE
Differentiated 0.6721311 0.3278689
Immunoreactive 0.6984127 0.3015873
Mesenchymal 0.6818182 0.3181818
Proliferative 0.6716418 0.3283582
with(dat, table(Subgroup, loss))
loss
Subgroup FALSE TRUE
FBI-AMP High 140 2
FBI-AMP Low 161 1
No AMP 54 2
with(dat, table(Subgroup, loh))
loh
Subgroup FALSE TRUE
FBI-AMP High 98 44
FBI-AMP Low 111 51
No AMP 41 15
Not really any difference betwen foldback subsets in terms of this.