Setup

library(ithi.utils)
load_base_libs()
library(ithi.meta)
library(ithi.figures)
library(ithi.expr)
library(ithi.external)
library(ithi.lohhla)
lohhla_icgc_outdir <- snakemake@input$lohhla_icgc_outdir
ith_icgc_bc_file <- snakemake@input$ith_icgc_bc
icgc_specimen_file <- snakemake@input$icgc_specimen
nanostring_annotations_path <- snakemake@input$nanostring_annotations
icgc_subtype_file <- snakemake@input$icgc_subtypes
icgc_expr_mat_file <- snakemake@input$icgc_expr_mat

lohhla_tcga_outdir <- snakemake@input$lohhla_tcga_outdir
tcga_ov_annotation_file <- snakemake@input$tcga_ov_annotations
nmf_subtype_file <- snakemake@input$array_nmf_subtypes
tcga_paper_subtype_file <- snakemake@input$tcga_paper_subtypes_raw
tcga_paper_silhouette_file <- snakemake@input$tcga_paper_subtypes_possilhouette
tcga_ov_bam_dir <- snakemake@input$tcga_ov_bam_dir
tcga_nonstd_expr_mat_file <- snakemake@input$tcga_nonstd_expr

db_path <- snakemake@params$db
supportive_site_threshold <- as.numeric(snakemake@params$lohhla_supportive_site_threshold)
mismatch_site_threshold <- as.numeric(snakemake@params$lohhla_mismatch_site_threshold)
icgc_specimen <- fread(icgc_specimen_file)
ith_icgc_bc <- fread(ith_icgc_bc_file)
icgc_subtypes <- fread(icgc_subtype_file)
icgc_expr_mat <- fread(icgc_expr_mat_file)

nmf_subtypes <- read_tcga_nmf_subtypes(nmf_subtype_file)
tcga_paper_subtypes <- read_tcga_paper_subtypes(tcga_paper_subtype_file, tcga_paper_silhouette_file)
tcga_nonstd_expr_mat <- fread(tcga_nonstd_expr_mat_file)

Read 0.0% of 570 rows
Read 570 rows and 12439 (of 12439) columns from 0.112 GB file in 00:00:06
tcga_ov_annotations <- fread(tcga_ov_annotation_file)

nanostring_labels <- fread(nanostring_annotations_path)
nanostring_labels_modified <- read_nanostring_labels_modified(nanostring_annotations_path)

tcga_bam_manifest <- create_bam_manifest(tcga_ov_bam_dir, ucec_format = FALSE, 
    local = FALSE)

Here we’re going to run LOHHLA on ICGC and TCGA.

A few things to note:

  • ICGC is lower sequencing depth
  • TCGA-OV was the earliest TCGA study and is not as high-quality as the studies after it – comparisons may need to be taken with a pinch of salt

ICGC

icgc_hlaloss_table <- extract_lohhla_table(lohhla_icgc_outdir)
icgc_loss_table <- construct_loss_table(icgc_hlaloss_table, type = "ICGC", supportive_site_threshold = supportive_site_threshold, 
    mismatch_site_threshold = mismatch_site_threshold)
icgc_loss_summarized <- summarize_loss_table(icgc_loss_table, type = "ICGC")
table(icgc_loss_summarized$loh)

FALSE  TRUE 
   36    17 

vs. Molecular subtypes

icgc_loss_summarized_subtypes <- merge(icgc_loss_summarized, icgc_subtypes, 
    by.x = c("sample_key"), by.y = c("icgc_donor_id"))

with(icgc_loss_summarized_subtypes, table(loh, subtype))
       subtype
loh     C1 C2 C4 C5
  FALSE 14 10  8  3
  TRUE   9  3  2  2
with(icgc_loss_summarized_subtypes, table(loh, nmf_subtype))
       nmf_subtype
loh     C1 C2 C4 C5
  FALSE 12  6  6 12
  TRUE   9  3  0  4
fisher.test(with(icgc_loss_summarized_subtypes, table(loh, nmf_subtype %in% 
    c("C1", "C2"))))

    Fisher's Exact Test for Count Data

data:  
p-value = 0.1311
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
  0.7111636 14.9334176
sample estimates:
odds ratio 
  2.938272 

Missing samples

subset(icgc_subtypes, icgc_donor_id %in% c("DO46388", "DO46390", "DO46571"))

The other 4 samples have no suitable HLA alleles (i.e. they’re homozygous at all 3 HLA loci). These 3 samples had the error that is due to no reads mapping to one of the alleles. If these samples are true ‘LOH’, their molecular subtype assignments are not surprising in light of the previous results – 2/3 are C1 samples, by either classification.

vs. Expression patterns

Batch corrected file with our cohort

icgc_pathway_matrix <- ithi.expr::create_pathway_matrix(ith_icgc_bc, nanostring_labels, 
    db_path, convert_ids = FALSE, summary_method = "arithmetic_mean")

icgc_pathway_matrix_summarized <- summarize_expression_by_patient(icgc_pathway_matrix, 
    icgc_specimen)

icgc_pathway_df <- icgc_pathway_matrix_summarized %>% tibble::column_to_rownames(var = "Name") %>% 
    as.matrix %>% melt %>% plyr::rename(c(Var1 = "pathway", Var2 = "sample_key", 
    value = "mean_expr"))
icgc_loss_summarized_expr <- merge(icgc_loss_summarized, icgc_pathway_df)
df <- icgc_loss_summarized_expr  # %>% subset(pathway %in% c('T-Cell Functions', 'B-Cell Functions', 'Cytotoxicity'))

pvals <- compute_pvals_subsets(df, facet_vars = c("pathway"), corfun = wilcox.test, 
    formula = mean_expr ~ loh)

ggplot(df, aes(x = loh, y = mean_expr)) + geom_boxplot(outlier.size = -1) + 
    geom_jitter(position = position_jitter(width = 0.2, height = 0), alpha = 0.5) + 
    theme_bw() + facet_wrap(~pathway, scales = "free", ncol = 3) + theme_Publication() + 
    theme_nature() + geom_text(data = pvals, aes(x = Inf, y = Inf, label = p.value.text), 
    hjust = 1.1, vjust = 1.5, size = 2.5, parse = TRUE)

We can see a trend, but it’s not significant – likely due to sample size (see first section for sample size numbers).

So we see a tendency towards higher T-cell-associated gene expression among samples with LOHHLA, though this is not significant.

TCGA

At the moment I’m only using those samples for which we have ploidy/cellularity predictions from ASCAT from Nicolai Birkbak – that was from an older version of ASCAT, etc.

I still have to run the newer ASCAT pipeline – ran into a few problems with time earlier that I’ll probably have to resolve by making the ASCAT package more efficient. Got halfway through this then got sidetracked by other analysis …

So these results are really on about 2/3 to 3/4 of the TCGA-OV cohort.

tcga_hlaloss_table <- extract_lohhla_table(lohhla_tcga_outdir)
tcga_hlaloss_table$tcga_sample_id <- stringr::str_extract(tcga_hlaloss_table$region, 
    "TCGA\\-[0-9A-Z]{2}\\-[0-9A-Z]{4}")
tcga_hlaloss_table$analyte_type <- stringr::str_extract(tcga_hlaloss_table$region, 
    "(?<=TCGA\\-[0-9A-Z]{2}\\-[0-9A-Z]{4}\\-[0-9A-Z]{3}\\-[0-9A-Z]{2})[A-Z]")
# tcga_hlaloss_table <- subset(tcga_hlaloss_table, analyte_type == 'D')
hist(tcga_hlaloss_table$HLA_type1copyNum_withBAF, breaks = 30)

hist(tcga_hlaloss_table$HLA_type2copyNum_withBAF, breaks = 30)

Prevalence of HLA loss

By sample key

tcga_loss_table <- construct_loss_table(tcga_hlaloss_table, type = "TCGA", supportive_site_threshold = supportive_site_threshold, 
    mismatch_site_threshold = mismatch_site_threshold)
max_plate <- tcga_loss_table %>% group_by(tcga_sample_id) %>% summarise(max_plate = max(as.numeric(plate)))
tcga_loss_table <- tcga_loss_table %>% plyr::join(max_plate)
tcga_loss_table <- subset(tcga_loss_table, as.numeric(plate) == max_plate)

tcga_loss_summarized <- summarize_loss_table(tcga_loss_table, type = "TCGA")

Comparison with molecular subtypes

tcga_manifest_data <- tcga_bam_manifest %>% subset(specimen_type == "diseased", 
    select = c(center, patient, barcode, sample_key, short_barcode))

nmf_subtypes_merged <- Reduce(function(x, y) plyr::join(x, y, type = "full"), 
    list(nmf_subtypes, tcga_manifest_data, tcga_paper_subtypes)) %>% plyr::rename(c(subtype = "nmf_subtype", 
    secondary = "nmf_secondary_subtype")) %>% subset(select = -c(tcga_sample_id))
# nmf_subtypes_merged_noduplicates <-
# nmf_subtypes_merged[!duplicated(nmf_subtypes_merged$short_barcode),]
nmf_subtypes_merged$tcga_sample_id <- nmf_subtypes_merged$patient
nmf_subtypes_merged_noduplicates <- nmf_subtypes_merged
nmf_subtypes_merged_noduplicates <- subset(nmf_subtypes_merged_noduplicates, 
    !is.na(sample_key)) %>% subset(select = -c(barcode, sample_key, center)) %>% 
    unique
nmf_subtypes_merged_noduplicates <- nmf_subtypes_merged_noduplicates[!duplicated(nmf_subtypes_merged_noduplicates$patient), 
    ]
tcga_loss_summarized$analysis_center <- str_extract(tcga_loss_summarized$sample_key, 
    "(Non\\-)?Broad")

tcga_loss_annotated <- Reduce(function(x, y) plyr::join(x, y, type = "full"), 
    list(tcga_loss_summarized %>% as.data.frame, tcga_ov_annotations, nmf_subtypes_merged_noduplicates))

Subtypes from Yikan’s table

with(tcga_loss_annotated, table(MolecularSubtype, loss))
                loss
MolecularSubtype FALSE TRUE
  Differentiated    61    0
  Immunoreactive    61    3
  Mesenchymal       66    0
  Proliferative     65    2
with(tcga_loss_annotated, table(MolecularSubtype, loss))/rowSums(with(tcga_loss_annotated, 
    table(MolecularSubtype, loss)))
                loss
MolecularSubtype      FALSE       TRUE
  Differentiated 1.00000000 0.00000000
  Immunoreactive 0.95312500 0.04687500
  Mesenchymal    1.00000000 0.00000000
  Proliferative  0.97014925 0.02985075
fisher.test(with(tcga_loss_annotated, table(MolecularSubtype %in% c("Immunoreactive", 
    "Mesenchymal"), loss)))

    Fisher's Exact Test for Count Data

data:  
p-value = 0.3547
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
  0.3061054 32.8301221
sample estimates:
odds ratio 
  2.708565 
with(tcga_loss_annotated, table(MolecularSubtype, loh))
                loh
MolecularSubtype FALSE TRUE
  Differentiated    41   20
  Immunoreactive    45   19
  Mesenchymal       44   22
  Proliferative     45   22
with(tcga_loss_annotated, table(MolecularSubtype, loh))/rowSums(with(tcga_loss_annotated, 
    table(MolecularSubtype, loh)))
                loh
MolecularSubtype     FALSE      TRUE
  Differentiated 0.6721311 0.3278689
  Immunoreactive 0.7031250 0.2968750
  Mesenchymal    0.6666667 0.3333333
  Proliferative  0.6716418 0.3283582

NMF-inferred subtypes

## Not technically a primary site filter here -- this is just a simple Wang
## et al. filter
filter_primary <- FALSE

dat <- tcga_loss_annotated

if (filter_primary) {
    dat <- dat %>% subset(!is.na(MolecularSubtype))
}
with(dat, table(nmf_subtype, loss))
           loss
nmf_subtype FALSE TRUE
         C1    62    0
         C2    97    3
         C4   101    0
         C5    94    2
with(dat, table(nmf_subtype, loss))/rowSums(with(dat, table(nmf_subtype, loss)))
           loss
nmf_subtype      FALSE       TRUE
         C1 1.00000000 0.00000000
         C2 0.97000000 0.03000000
         C4 1.00000000 0.00000000
         C5 0.97916667 0.02083333
with(dat, table(nmf_subtype, loh))
           loh
nmf_subtype FALSE TRUE
         C1    42   20
         C2    72   28
         C4    73   28
         C5    63   33
with(dat, table(nmf_subtype, loh))/rowSums(with(dat, table(nmf_subtype, loh)))
           loh
nmf_subtype     FALSE      TRUE
         C1 0.6774194 0.3225806
         C2 0.7200000 0.2800000
         C4 0.7227723 0.2772277
         C5 0.6562500 0.3437500
with(dat %>% subset(sil_width > 0), table(nmf_subtype, loss))
           loss
nmf_subtype FALSE TRUE
         C1    62    0
         C2    88    3
         C4    68    0
         C5    65    2
with(dat %>% subset(sil_width > 0), table(nmf_subtype, loss))/rowSums(with(dat %>% 
    subset(sil_width > 0), table(nmf_subtype, loss)))
           loss
nmf_subtype      FALSE       TRUE
         C1 1.00000000 0.00000000
         C2 0.96703297 0.03296703
         C4 1.00000000 0.00000000
         C5 0.97014925 0.02985075
with(dat %>% subset(sil_width > 0), table(nmf_subtype, loh))
           loh
nmf_subtype FALSE TRUE
         C1    42   20
         C2    64   27
         C4    48   20
         C5    45   22
with(dat %>% subset(sil_width > 0), table(nmf_subtype, loh))/rowSums(with(dat %>% 
    subset(sil_width > 0), table(nmf_subtype, loh)))
           loh
nmf_subtype     FALSE      TRUE
         C1 0.6774194 0.3225806
         C2 0.7032967 0.2967033
         C4 0.7058824 0.2941176
         C5 0.6716418 0.3283582

Expression differences between NMF-annotated subtypes

tcga_nonstd_expr_df <- tcga_nonstd_expr_mat %>% as.data.frame %>% tibble::column_to_rownames("tcga_sample_id") %>% 
    t %>% as.data.frame %>% tibble::rownames_to_column("Name")
tcga_pathway_matrix_amean <- ithi.expr::create_pathway_matrix(tcga_nonstd_expr_df, 
    nanostring_labels_modified, db_path, convert_ids = FALSE, summary_method = "arithmetic_mean")
tcga_pathway_matrix_gmean <- ithi.expr::create_pathway_matrix(tcga_nonstd_expr_df, 
    nanostring_labels_modified, db_path, convert_ids = FALSE, summary_method = "geometric_mean")

tcga_pathway_matrix_nano <- tcga_pathway_matrix_amean[setdiff(rownames(tcga_pathway_matrix_amean), 
    "Rooney_Cytotoxicity"), ]
tcga_pathway_matrix_rooney <- tcga_pathway_matrix_gmean["Rooney_Cytotoxicity", 
    , drop = FALSE]

tcga_pathway_matrix <- rbind(tcga_pathway_matrix_nano, tcga_pathway_matrix_rooney)

tcga_pathway_df <- tcga_pathway_matrix %>% as.matrix %>% melt %>% plyr::rename(c(Var2 = "short_barcode", 
    Var1 = "pathway", value = "expr"))
tcga_pathway_df_annotated <- plyr::join(tcga_pathway_df, nmf_subtypes_merged_noduplicates)
p <- ggplot(tcga_pathway_df_annotated, aes(x = nmf_subtype, y = expr)) + geom_boxplot() + 
    theme_bw() + theme_Publication() + theme_nature() + facet_wrap(~pathway, 
    scales = "free", ncol = 2)

p

C5 is clearly the lowest, below C4.

Expression difference between LOH and non-LOH samples

tcga_pathway_loss <- tcga_pathway_df_annotated %>% as.data.frame %>% plyr::join(tcga_loss_annotated)
df <- tcga_pathway_loss %>% subset(!is.na(loh))
df$center <- str_extract(df$tcga_sample_id, "(?<=\\-)[0-9A-Z]{2}(?=\\-)")

pvals <- ithi.utils::compute_pvals_subsets(df, facet_vars = c("pathway"), formula = expr ~ 
    loh, corfun = wilcox.test)

p <- ggplot(df, aes(x = loh, y = expr)) + geom_boxplot() + theme_bw() + theme_Publication() + 
    theme_nature() + facet_wrap(~pathway, scales = "free", ncol = 2) + geom_text(data = pvals, 
    aes(x = Inf, y = Inf, label = p.value.text), hjust = 1.1, vjust = 1.5, size = 2.5, 
    parse = TRUE)

p

I also did this with TCGA’s RNA-seq data in another file and found nothing.

By center

df_subset <- subset(df, pathway == "Rooney_Cytotoxicity" & !is.na(loh) & !is.na(center))

pvals <- ithi.utils::compute_pvals_subsets(df_subset, facet_vars = c("center"), 
    formula = expr ~ loh, corfun = wilcox.test)

p <- ggplot(df_subset, aes(x = loh, y = expr)) + geom_boxplot(outlier.size = -1) + 
    geom_jitter(position = position_jitter(width = 0.2, height = 0), alpha = 0.5) + 
    theme_bw() + theme_Publication() + theme_nature() + facet_wrap(~center, 
    scales = "free", ncol = 2) + geom_text(data = pvals, aes(x = Inf, y = Inf, 
    label = p.value.text), hjust = 1.1, vjust = 1.5, size = 2.5, parse = TRUE)

table(df_subset$center)

04 09 10 13 20 23 24 25 29 30 31 36 42 57 59 61 
26 15  9 75  9 29 60 12 35  6  3 14  7  3  3 47 
p

The UPenn center shows something, but the other 2 big ones – Broad and MSKCC – do not.

TCGA-annotated subtypes

with(dat %>% subset(paper_pos_silhouette), table(tcga_paper_subtype, loss))
                  loss
tcga_paper_subtype FALSE TRUE
    Differentiated    61    0
    Immunoreactive    60    3
    Mesenchymal       66    0
    Proliferative     65    2
with(dat %>% subset(paper_pos_silhouette), table(tcga_paper_subtype, loss))/rowSums(with(dat %>% 
    subset(paper_pos_silhouette), table(tcga_paper_subtype, loss)))
                  loss
tcga_paper_subtype      FALSE       TRUE
    Differentiated 1.00000000 0.00000000
    Immunoreactive 0.95238095 0.04761905
    Mesenchymal    1.00000000 0.00000000
    Proliferative  0.97014925 0.02985075
with(dat %>% subset(paper_pos_silhouette), table(tcga_paper_subtype, loh))
                  loh
tcga_paper_subtype FALSE TRUE
    Differentiated    41   20
    Immunoreactive    44   19
    Mesenchymal       45   21
    Proliferative     45   22
with(dat %>% subset(paper_pos_silhouette), table(tcga_paper_subtype, loh))/rowSums(with(dat %>% 
    subset(paper_pos_silhouette), table(tcga_paper_subtype, loh)))
                  loh
tcga_paper_subtype     FALSE      TRUE
    Differentiated 0.6721311 0.3278689
    Immunoreactive 0.6984127 0.3015873
    Mesenchymal    0.6818182 0.3181818
    Proliferative  0.6716418 0.3283582

Comparison with foldback inversion/HRD subtypes

with(dat, table(Subgroup, loss))
              loss
Subgroup       FALSE TRUE
  FBI-AMP High   140    2
  FBI-AMP Low    161    1
  No AMP          54    2
with(dat, table(Subgroup, loh))
              loh
Subgroup       FALSE TRUE
  FBI-AMP High    98   44
  FBI-AMP Low    111   51
  No AMP          41   15

Not really any difference betwen foldback subsets in terms of this.

---
title: "LOHHLA -- external datasets"
---
                        ```{r, echo=FALSE, message=FALSE, warning=FALSE}

######## Snakemake header ########
library(methods)
Snakemake <- setClass(
    "Snakemake",
    slots = c(
        input = "list",
        output = "list",
        params = "list",
        wildcards = "list",
        threads = "numeric",
        log = "list",
        resources = "list",
        config = "list",
        rule = "character"
    )
)
snakemake <- Snakemake(
    input = list('/shahlab/alzhang/data/ICGC/OVAU_expr_matrix.tsv', '/shahlab/alzhang/projects/ITH_Immune/data/expression/array/subtypes/nmf_subtypes.txt', '/shahlab/alzhang/data/TCGA/tcga_ov_annotation_sup13.txt', '/shahlab/alzhang/pipeline_outputs/ith_immune/lohhla/run1_icgc', '/shahlab/alzhang/data/TCGA/TCGA_489_UE.k4.txt', '/shahlab/alzhang/data/ICGC/specimen.tsv', '/shahlab/archive/immune_project/TCGA-OV/bam', '/shahlab/alzhang/data/TCGA/TCGA_489_UE.k4.posSilhouette.txt', 'notebooks/lohhla_tcga_icgc.Rmd', '/shahlab/alzhang/projects/ITH_Immune/data/expression/nanostring/pancancer_annotations.tsv', '/shahlab/alzhang/pipeline_outputs/ith_immune/lohhla/run8_tcga', '/shahlab/alzhang/data/TCGA/expr_matrix_normalize_noduplicates.tsv', '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/ith_icgc_merged_bc.tsv', '/shahlab/alzhang/data/ICGC/icgc_primary_tumour_subtypes.tsv', "icgc_expr_mat" = '/shahlab/alzhang/data/ICGC/OVAU_expr_matrix.tsv', "array_nmf_subtypes" = '/shahlab/alzhang/projects/ITH_Immune/data/expression/array/subtypes/nmf_subtypes.txt', "tcga_ov_annotations" = '/shahlab/alzhang/data/TCGA/tcga_ov_annotation_sup13.txt', "lohhla_icgc_outdir" = '/shahlab/alzhang/pipeline_outputs/ith_immune/lohhla/run1_icgc', "tcga_paper_subtypes_raw" = '/shahlab/alzhang/data/TCGA/TCGA_489_UE.k4.txt', "icgc_specimen" = '/shahlab/alzhang/data/ICGC/specimen.tsv', "tcga_ov_bam_dir" = '/shahlab/archive/immune_project/TCGA-OV/bam', "tcga_paper_subtypes_possilhouette" = '/shahlab/alzhang/data/TCGA/TCGA_489_UE.k4.posSilhouette.txt', "notebook" = 'notebooks/lohhla_tcga_icgc.Rmd', "nanostring_annotations" = '/shahlab/alzhang/projects/ITH_Immune/data/expression/nanostring/pancancer_annotations.tsv', "lohhla_tcga_outdir" = '/shahlab/alzhang/pipeline_outputs/ith_immune/lohhla/run8_tcga', "tcga_nonstd_expr" = '/shahlab/alzhang/data/TCGA/expr_matrix_normalize_noduplicates.tsv', "ith_icgc_bc" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/ith_icgc_merged_bc.tsv', "icgc_subtypes" = '/shahlab/alzhang/data/ICGC/icgc_primary_tumour_subtypes.tsv'),
    output = list('/shahlab/alzhang/projects/ITH_Immune/paper/results/review/notebooks/run2/lohhla_tcga_icgc.nb.html'),
    params = list(90.0, '/shahlab/alzhang/projects/ITH_Immune/metadata/db/immune_project.sqlite3', 5.0, 'lohhla_external_analysis', "lohhla_supportive_site_threshold" = 90.0, "db" = '/shahlab/alzhang/projects/ITH_Immune/metadata/db/immune_project.sqlite3', "lohhla_mismatch_site_threshold" = 5.0, "name" = 'lohhla_external_analysis'),
    wildcards = list(),
    threads = 1,
    log = list('/shahlab/alzhang/clusttmp/paperreview2/notebooks/lohhla_external_analysis.log'),
    resources = list(),
    config = list("lohhla_supportive_site_threshold" = 90.0, "array_nmf_subtypes" = '/shahlab/alzhang/projects/ITH_Immune/data/expression/array/subtypes/nmf_subtypes.txt', "patients_for_clonal" = c(1, 2, 3, 4, 7, 9, 10, 11, 12, 13, 14, 15, 16, 17), "bcr_diversity" = '/shahlab/alzhang/pipeline_outputs/ith_immune/mixcr/mixcr_runs/ith_1_2_3/mixcr5/postprocess/IGH/postfilter_diversity_stats/diversity.strict.resampled.txt', "snv_table" = '/shahlab/amcpherson/projects/ith3/ith3/notebooks/bespoke/ith_snvs.tsv', "mmctm_final_patient_dir" = '/shahlab/alzhang/projects/ITH_Immune/results/mmctm_results/ith_by-patient_with-ov', "tcr_diversity" = '/shahlab/alzhang/pipeline_outputs/ith_immune/mixcr/mixcr_runs/ith_1_2_3/mixcr5/postprocess/TRB/postfilter_diversity_stats/diversity.strict.resampled.txt', "clola_result_file" = '/shahlab/alzhang/pipeline_outputs/ith_immune/clola/run4/clola_condensed_results/beta/clola_results.tsv', "rooney_mutsigcv_file" = '/shahlab/alzhang/projects/ITH_Immune/external/other_papers/mmc6.xlsx', "epitopes_unique_filtered" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/epitopes_unique_filtered.tsv', "total_tiltypes" = c('T_CD8_density', 'T_CD4_density', 'T_CD20_density', 'T_Plasma_density'), "copynumber_table" = '/shahlab/alzhang/data/ithi/master_copynumber_file.tsv', "xcr_table" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/xcr_table.tsv', "ith_stats" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/ith_statistics.tsv', "tcga_nonstd_expr" = '/shahlab/alzhang/data/TCGA/expr_matrix_normalize_noduplicates.tsv', "igpartition_outdir" = '/shahlab/alzhang/pipeline_outputs/ith_immune/igpartition/run22', "all_tiltypes" = c('T_CD8_density', 'T_CD4_density', 'T_CD20_density', 'T_Plasma_density', 'E_CD8_density', 'E_CD4_density', 'E_CD20_density', 'E_Plasma_density', 'S_CD8_density', 'S_CD4_density', 'S_CD20_density', 'S_Plasma_density'), "snv_cluster_dir" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/snv_cluster', "icgc_expr_mat" = '/shahlab/alzhang/data/ICGC/OVAU_expr_matrix.tsv', "lohhla_icgc_outdir" = '/shahlab/alzhang/pipeline_outputs/ith_immune/lohhla/run1_icgc', "breakpoint_table" = '/shahlab/amcpherson/projects/ith3/ith3/notebooks/bespoke/ith_breakpoints.tsv', "he_results_dir" = '/shahlab/alzhang/data/ithi/finn_results/he_output_Nov29', "tumour_purity" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/tumour_purity.tsv', "lohhla_mismatch_site_threshold" = 5.0, "tcga_ov_bam_dir" = '/shahlab/archive/immune_project/TCGA-OV/bam', "prevalence_threshold" = 0.01, "image_summary2" = '/shahlab/alzhang/data/ithi/yuan_hecr_image_results_2.csv', "distance_method" = 'horn', "finnhe_pipeline_results_dir" = '/shahlab/alzhang/pipeline_outputs/ith_immune/finnhe/run1', "notebook_dir" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/review/notebooks/run2', "somatic_coding_result_dir" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/somatic_coding_variants', "table_dir" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/review/tables/run2', "refseq_gene_file" = '/shahlab/alzhang/data/genome/hg19/refseq_genes.bed', "ith_stat_types" = c('entropy', 'postprocessed_divergence', 'combined_ith_normalized', 'proportion_subclonal'), "remixt_cellularity_ploidy" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/remixt_cellularity_ploidy.tsv', "logdir" = '/shahlab/alzhang/clusttmp/paperreview2', "tcga_ov_annotations" = '/shahlab/alzhang/data/TCGA/tcga_ov_annotation_sup13.txt', "ihc_features_output" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/intermediates/run2/ihc_features_output.txt', "molsubtypes" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/molsubtypes.tsv', "clone_prevalences" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/clone_data.tsv', "array_expression_file" = '/shahlab/alzhang/projects/ITH_Immune/data/expression/array/gene_exprs_rma_batch_corrected.txt', "benchmarkdir" = '/shahlab/alzhang/benchmarks/paperreview2', "clone_branch_lengths" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/branch_data.tsv', "nanostring_annotations" = '/shahlab/alzhang/projects/ITH_Immune/data/expression/nanostring/pancancer_annotations.tsv', "lohhla_tcga_outdir" = '/shahlab/alzhang/pipeline_outputs/ith_immune/lohhla/run8_tcga', "tils_for_cluster" = c('E_CD8_density', 'E_CD4_density', 'E_CD20_density', 'E_Plasma_density', 'S_CD8_density', 'S_CD4_density', 'S_CD20_density', 'S_Plasma_density'), "ith_icgc_bc" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/ith_icgc_merged_bc.tsv', "image_summary" = '/shahlab/alzhang/data/ithi/yuan_hecr_image_results.csv', "ihc_table" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/ihc_table.tsv', "db" = '/shahlab/alzhang/projects/ITH_Immune/metadata/db/immune_project.sqlite3', "known_subtypes_array" = '/shahlab/alzhang/projects/ITH_Immune/data/expression/array/subtypes/known_subtypes.tsv', "tilcluster_supervised_ipynb" = '/shahlab/alzhang/projects/ITH_Immune/paper/review/ipy/tilcluster_supervisedmulticlass.ipynb', "tcga_paper_subtypes_raw" = '/shahlab/alzhang/data/TCGA/TCGA_489_UE.k4.txt', "neoediting_outdir" = '/shahlab/alzhang/pipeline_outputs/ith_immune/neoediting/run6', "clone_trees" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/tables/run2/clones/tree_data.tsv', "tcga_paper_subtypes_possilhouette" = '/shahlab/alzhang/data/TCGA/TCGA_489_UE.k4.posSilhouette.txt', "tils_for_variability" = c('T_CD8_density', 'T_CD4_density', 'T_CD20_density', 'T_Plasma_density'), "til_clusters_output" = '/shahlab/alzhang/projects/ITH_Immune/paper/results/intermediates/run2/til_clusters_output.txt', "icgc_specimen" = '/shahlab/alzhang/data/ICGC/specimen.tsv', "icgc_subtypes" = '/shahlab/alzhang/data/ICGC/icgc_primary_tumour_subtypes.tsv', "nanostring_data" = '/shahlab/alzhang/projects/ITH_Immune/results/nanostring_results/ith_full/qc/limma_quantile/normalized_expression_voa_labels_filtered.tsv', "variability_type" = 'stabilize'),
    rule = 'lohhla_external_analysis'
)
######## Original script #########

                        ```


## Setup

```{r global_chunk_options, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, tidy=TRUE, warning=FALSE, message=FALSE, cache=TRUE) #cache=TRUE
```

```{r}
library(ithi.utils)
load_base_libs()
library(ithi.meta)
library(ithi.figures)
library(ithi.expr)
library(ithi.external)
library(ithi.lohhla)
```

```{r}
lohhla_icgc_outdir <- snakemake@input$lohhla_icgc_outdir
ith_icgc_bc_file <- snakemake@input$ith_icgc_bc
icgc_specimen_file <- snakemake@input$icgc_specimen
nanostring_annotations_path <- snakemake@input$nanostring_annotations
icgc_subtype_file <- snakemake@input$icgc_subtypes
icgc_expr_mat_file <- snakemake@input$icgc_expr_mat

lohhla_tcga_outdir <- snakemake@input$lohhla_tcga_outdir
tcga_ov_annotation_file <- snakemake@input$tcga_ov_annotations
nmf_subtype_file <- snakemake@input$array_nmf_subtypes
tcga_paper_subtype_file <- snakemake@input$tcga_paper_subtypes_raw
tcga_paper_silhouette_file <- snakemake@input$tcga_paper_subtypes_possilhouette
tcga_ov_bam_dir <- snakemake@input$tcga_ov_bam_dir
tcga_nonstd_expr_mat_file <- snakemake@input$tcga_nonstd_expr

db_path <- snakemake@params$db
supportive_site_threshold <- as.numeric(snakemake@params$lohhla_supportive_site_threshold)
mismatch_site_threshold <- as.numeric(snakemake@params$lohhla_mismatch_site_threshold)
```

```{r}
icgc_specimen <- fread(icgc_specimen_file)
ith_icgc_bc <- fread(ith_icgc_bc_file)
icgc_subtypes <- fread(icgc_subtype_file)
icgc_expr_mat <- fread(icgc_expr_mat_file)

nmf_subtypes <- read_tcga_nmf_subtypes(nmf_subtype_file)
tcga_paper_subtypes <- read_tcga_paper_subtypes(tcga_paper_subtype_file, tcga_paper_silhouette_file)
tcga_nonstd_expr_mat <- fread(tcga_nonstd_expr_mat_file)
tcga_ov_annotations <- fread(tcga_ov_annotation_file)

nanostring_labels <- fread(nanostring_annotations_path)
nanostring_labels_modified <- read_nanostring_labels_modified(nanostring_annotations_path)

tcga_bam_manifest <- create_bam_manifest(tcga_ov_bam_dir, ucec_format = FALSE, local = FALSE)
```


Here we're going to run LOHHLA on ICGC and TCGA. 

A few things to note:

* ICGC is lower sequencing depth
* TCGA-OV was the earliest TCGA study and is not as high-quality as the studies after it -- comparisons may need to be taken with a pinch of salt

## ICGC

```{r}
icgc_hlaloss_table <- extract_lohhla_table(lohhla_icgc_outdir)
```

```{r}
icgc_loss_table <- construct_loss_table(icgc_hlaloss_table, type = "ICGC", supportive_site_threshold = supportive_site_threshold,
                                        mismatch_site_threshold = mismatch_site_threshold)
icgc_loss_summarized <- summarize_loss_table(icgc_loss_table, type = "ICGC")
```

```{r}
table(icgc_loss_summarized$loh)
```

### vs. Molecular subtypes

```{r}
icgc_loss_summarized_subtypes <- merge(icgc_loss_summarized, icgc_subtypes, by.x = c("sample_key"), by.y = c("icgc_donor_id"))

with(icgc_loss_summarized_subtypes, table(loh, subtype))

with(icgc_loss_summarized_subtypes, table(loh, nmf_subtype))

fisher.test(with(icgc_loss_summarized_subtypes, table(loh, nmf_subtype %in% c("C1", "C2"))))
```

#### Missing samples

```{r}
subset(icgc_subtypes, icgc_donor_id %in% c("DO46388", "DO46390", "DO46571"))
```

The other 4 samples have no suitable HLA alleles (i.e. they're homozygous at all 3 HLA loci). These 3 samples had the error that is due to no reads mapping to one of the alleles. If these samples are true 'LOH', their molecular subtype assignments are not surprising in light of the previous results -- 2/3 are C1 samples, by either classification. 

### vs. Expression patterns

#### Batch corrected file with our cohort

```{r}
icgc_pathway_matrix <- ithi.expr::create_pathway_matrix(ith_icgc_bc, nanostring_labels, db_path, 
                                                   convert_ids = FALSE, summary_method = "arithmetic_mean")

icgc_pathway_matrix_summarized <- summarize_expression_by_patient(icgc_pathway_matrix, icgc_specimen)

icgc_pathway_df <- icgc_pathway_matrix_summarized %>% tibble::column_to_rownames(var = "Name") %>% as.matrix %>% melt %>%
  plyr::rename(c('Var1'='pathway', 'Var2'='sample_key', 'value'='mean_expr'))
```

```{r}
icgc_loss_summarized_expr <- merge(icgc_loss_summarized, icgc_pathway_df)
```

```{r, fig.height = 12, fig.width = 7}
df <- icgc_loss_summarized_expr# %>% subset(pathway %in% c("T-Cell Functions", "B-Cell Functions", "Cytotoxicity"))

pvals <- compute_pvals_subsets(df, facet_vars = c("pathway"), corfun = wilcox.test, formula = mean_expr ~ loh)

ggplot(df, aes(x=loh, y = mean_expr)) + geom_boxplot(outlier.size = -1) + geom_jitter(position = position_jitter(width = 0.2, height = 0), alpha = 0.5) +
  theme_bw() + facet_wrap(~ pathway, scales = "free", ncol = 3) + 
  theme_Publication() + theme_nature() + geom_text(data=pvals, aes(x=Inf, y=Inf, label=p.value.text), hjust=1.1, vjust=1.5,size=2.5,parse=TRUE)
```


We can see a trend, but it's not significant -- likely due to sample size (see first section for sample size numbers). 

So we see a tendency towards higher T-cell-associated gene expression among samples with LOHHLA, though this is not significant. 


## TCGA

At the moment I'm only using those samples for which we have ploidy/cellularity predictions from ASCAT from Nicolai Birkbak -- that was from an older version of ASCAT, etc. 

I still have to run the newer ASCAT pipeline -- ran into a few problems with time earlier that I'll probably have to resolve by making the ASCAT package more efficient. Got halfway through this then got sidetracked by other analysis ...

So these results are really on about 2/3 to 3/4 of the TCGA-OV cohort. 

```{r}
tcga_hlaloss_table <- extract_lohhla_table(lohhla_tcga_outdir)
tcga_hlaloss_table$tcga_sample_id <- stringr::str_extract(tcga_hlaloss_table$region, "TCGA\\-[0-9A-Z]{2}\\-[0-9A-Z]{4}")
tcga_hlaloss_table$analyte_type <- stringr::str_extract(tcga_hlaloss_table$region, "(?<=TCGA\\-[0-9A-Z]{2}\\-[0-9A-Z]{4}\\-[0-9A-Z]{3}\\-[0-9A-Z]{2})[A-Z]")
#tcga_hlaloss_table <- subset(tcga_hlaloss_table, analyte_type == "D")
```

```{r}
hist(tcga_hlaloss_table$HLA_type1copyNum_withBAF, breaks = 30)
hist(tcga_hlaloss_table$HLA_type2copyNum_withBAF, breaks = 30)
```

### Prevalence of HLA loss

#### By sample key

```{r}
tcga_loss_table <- construct_loss_table(tcga_hlaloss_table, type = "TCGA", supportive_site_threshold = supportive_site_threshold, mismatch_site_threshold = mismatch_site_threshold)
max_plate <- tcga_loss_table %>% group_by(tcga_sample_id) %>% summarise(max_plate=max(as.numeric(plate)))
tcga_loss_table <- tcga_loss_table %>% plyr::join(max_plate)
tcga_loss_table <- subset(tcga_loss_table, as.numeric(plate) == max_plate)

tcga_loss_summarized <- summarize_loss_table(tcga_loss_table, type = "TCGA")
```

### Comparison with molecular subtypes

```{r}
tcga_manifest_data <- tcga_bam_manifest %>% subset(specimen_type == "diseased", select=c(center, patient, barcode, sample_key, short_barcode))

nmf_subtypes_merged <- Reduce(function(x,y) plyr::join(x,y,type='full'), list(
  nmf_subtypes,
  tcga_manifest_data,
  tcga_paper_subtypes
)) %>% plyr::rename(c('subtype'='nmf_subtype', 'secondary'='nmf_secondary_subtype')) %>% subset(select=-c(tcga_sample_id))
#nmf_subtypes_merged_noduplicates <- nmf_subtypes_merged[!duplicated(nmf_subtypes_merged$short_barcode),]
nmf_subtypes_merged$tcga_sample_id <- nmf_subtypes_merged$patient
nmf_subtypes_merged_noduplicates <- nmf_subtypes_merged
nmf_subtypes_merged_noduplicates <- subset(nmf_subtypes_merged_noduplicates, !is.na(sample_key)) %>% subset(select=-c(barcode, sample_key, center)) %>% unique
nmf_subtypes_merged_noduplicates <- nmf_subtypes_merged_noduplicates[!duplicated(nmf_subtypes_merged_noduplicates$patient),]
```

```{r}
tcga_loss_summarized$analysis_center <- str_extract(tcga_loss_summarized$sample_key, "(Non\\-)?Broad")

tcga_loss_annotated <- Reduce(function(x,y) plyr::join(x, y, type = 'full'), list(tcga_loss_summarized %>% as.data.frame, tcga_ov_annotations, nmf_subtypes_merged_noduplicates))
```

#### Subtypes from Yikan's table

```{r}
with(tcga_loss_annotated, table(MolecularSubtype, loss))

with(tcga_loss_annotated, table(MolecularSubtype, loss))/rowSums(with(tcga_loss_annotated, table(MolecularSubtype, loss)))

fisher.test(with(tcga_loss_annotated, table(MolecularSubtype %in% c("Immunoreactive", "Mesenchymal"), loss)))
```

```{r}
with(tcga_loss_annotated, table(MolecularSubtype, loh))

with(tcga_loss_annotated, table(MolecularSubtype, loh))/rowSums(with(tcga_loss_annotated, table(MolecularSubtype, loh)))
```


#### NMF-inferred subtypes

```{r}
## Not technically a primary site filter here -- this is just a simple Wang et al. filter
filter_primary <- FALSE

dat <- tcga_loss_annotated

if (filter_primary) {
  dat <- dat %>% subset(!is.na(MolecularSubtype))
}
```

```{r}
with(dat, table(nmf_subtype, loss))

with(dat, table(nmf_subtype, loss))/rowSums(with(dat, table(nmf_subtype, loss)))
```

```{r}
with(dat, table(nmf_subtype, loh))

with(dat, table(nmf_subtype, loh))/rowSums(with(dat, table(nmf_subtype, loh)))
```

```{r}
with(dat %>% subset(sil_width > 0), table(nmf_subtype, loss))

with(dat %>% subset(sil_width > 0), table(nmf_subtype, loss))/rowSums(with(dat %>% subset(sil_width > 0), table(nmf_subtype, loss)))
```

```{r}
with(dat %>% subset(sil_width > 0), table(nmf_subtype, loh))

with(dat %>% subset(sil_width > 0), table(nmf_subtype, loh))/rowSums(with(dat %>% subset(sil_width > 0), table(nmf_subtype, loh)))
```

### Expression differences between NMF-annotated subtypes

```{r}
tcga_nonstd_expr_df <- tcga_nonstd_expr_mat %>% as.data.frame %>% tibble::column_to_rownames("tcga_sample_id") %>% t %>% as.data.frame %>% tibble::rownames_to_column("Name")
tcga_pathway_matrix_amean <- ithi.expr::create_pathway_matrix(tcga_nonstd_expr_df, nanostring_labels_modified, db_path, convert_ids = FALSE, summary_method = "arithmetic_mean")
tcga_pathway_matrix_gmean <- ithi.expr::create_pathway_matrix(tcga_nonstd_expr_df, nanostring_labels_modified, db_path, convert_ids = FALSE, summary_method = "geometric_mean")

tcga_pathway_matrix_nano <- tcga_pathway_matrix_amean[setdiff(rownames(tcga_pathway_matrix_amean), "Rooney_Cytotoxicity"),]
tcga_pathway_matrix_rooney <- tcga_pathway_matrix_gmean["Rooney_Cytotoxicity",,drop=FALSE]

tcga_pathway_matrix <- rbind(tcga_pathway_matrix_nano, tcga_pathway_matrix_rooney)

tcga_pathway_df <- tcga_pathway_matrix %>% as.matrix %>% melt %>% plyr::rename(c('Var2'='short_barcode', 'Var1'='pathway', 'value'='expr'))
```


```{r}
tcga_pathway_df_annotated <- plyr::join(tcga_pathway_df, nmf_subtypes_merged_noduplicates)
```

```{r, fig.height = 12, fig.width = 5}
p <- ggplot(tcga_pathway_df_annotated, aes(x=nmf_subtype, y = expr)) + geom_boxplot() + theme_bw() + theme_Publication() + theme_nature() + facet_wrap(~ pathway, scales = "free", ncol = 2)

p
```

C5 is clearly the lowest, below C4. 

### Expression difference between LOH and non-LOH samples

```{r}
tcga_pathway_loss <- tcga_pathway_df_annotated %>% as.data.frame %>% plyr::join(tcga_loss_annotated)
```

```{r, fig.height = 18, fig.width = 5}
df <- tcga_pathway_loss %>% subset(!is.na(loh))
df$center <- str_extract(df$tcga_sample_id, "(?<=\\-)[0-9A-Z]{2}(?=\\-)")

pvals <- ithi.utils::compute_pvals_subsets(df, facet_vars = c("pathway"), formula = expr ~ loh, corfun = wilcox.test)

p <- ggplot(df, aes(x=loh, y = expr)) + geom_boxplot() + theme_bw() + theme_Publication() + theme_nature() + facet_wrap(~ pathway, scales = "free", ncol = 2) + geom_text(data=pvals, aes(x=Inf, y=Inf, label=p.value.text), hjust=1.1, vjust=1.5,size=2.5,parse=TRUE) 

p
```

I also did this with TCGA's RNA-seq data in another file and found nothing. 

#### By center

```{r, fig.height = 18, fig.width = 5}
df_subset <- subset(df, pathway == "Rooney_Cytotoxicity" & !is.na(loh) & !is.na(center))

pvals <- ithi.utils::compute_pvals_subsets(df_subset, facet_vars = c("center"), formula = expr ~ loh, corfun = wilcox.test)

p <- ggplot(df_subset, aes(x=loh, y = expr)) + geom_boxplot(outlier.size = -1) + geom_jitter(position = position_jitter(width = 0.2, height = 0), alpha = 0.5) + theme_bw() + theme_Publication() + theme_nature() + facet_wrap(~ center, scales = "free", ncol = 2) + geom_text(data=pvals, aes(x=Inf, y=Inf, label=p.value.text), hjust=1.1, vjust=1.5,size=2.5,parse=TRUE)

table(df_subset$center)

p
```

The UPenn center shows something, but the other 2 big ones -- Broad and MSKCC -- do not. 

#### TCGA-annotated subtypes

```{r}
with(dat %>% subset(paper_pos_silhouette), table(tcga_paper_subtype, loss))

with(dat %>% subset(paper_pos_silhouette), table(tcga_paper_subtype, loss))/rowSums(with(dat %>% subset(paper_pos_silhouette), table(tcga_paper_subtype, loss)))
```

```{r}
with(dat %>% subset(paper_pos_silhouette), table(tcga_paper_subtype, loh))

with(dat %>% subset(paper_pos_silhouette), table(tcga_paper_subtype, loh))/rowSums(with(dat %>% subset(paper_pos_silhouette), table(tcga_paper_subtype, loh)))
```


### Comparison with foldback inversion/HRD subtypes

```{r}
with(dat, table(Subgroup, loss))
```

```{r}
with(dat, table(Subgroup, loh))
```

Not really any difference betwen foldback subsets in terms of this. 



