Data Loading and Tidying

RNA-seq data

Input file: /Users/brunogrande/Repos/dlbcl_coo_classification/data/expr.tsv

How it was generated: The RNA-seq data was aligned using TopHat2 and the reads were counted for each gene using htseq featureCount. Only protein-coding CDS regions were used as features. The expression counts from individual samples were consolidated into this file by running the following command on Thanos.

# On thanos, working directory: /home/rmorin/work/expression/htseq_rerun/
files=(DLC*.htseq)
(echo gene ${files%.*} | tr " " "\t"; paste ${files} | cut -f1,$(echo {2..$(expr 2 \* ${#files[@]})..2} | tr " " ",")) > expr.tsv

The RNA-seq expression data is loaded as a data frame and converted into a matrix. No further processing is done.

expr_df <- 
  readr::read_tsv(expr_path, progress = FALSE) %>%
  dplyr::filter(!grepl("^__", gene))

expr <- 
  expr_df %>%
  dplyr::select(-gene) %>%
  as.matrix() %>%
  magrittr::set_rownames(expr_df$gene)

# Ensure no NAs
testthat::expect_false(any(is.na(expr)))

patients_rnaseq <- colnames(expr)
gene_ids <- rownames(expr)

# Ensure no duplicates
testthat::expect_false(any(duplicated(patients_rnaseq)))
testthat::expect_false(any(duplicated(gene_ids)))

Lymph2Cx COO status

Input file: /Users/brunogrande/Repos/dlbcl_coo_classification/data/coo.tsv

How it was generated: This file contains the COO classification from the NanoString Lymph2Cx assay for the entire DLC cohort. It was copied from Thanos: /home/rmorin/work/expression/coo_fix.txt.

For the NanoString COO classification, I’m annotating the COO codes (1, 2 or 3) with meaningful names. I’m also creating a fourth class of samples called “NA”. Lastly, I’m filtering this data frame to only include patients in the RNA-seq dataset.

coo <-
  readr::read_tsv(coo_path, col_names = c("patient", "code"), skip = 1) %>%
  dplyr::left_join(coo_codes, by = "code") %>%
  dplyr::select(-code) %>%
  dplyr::mutate(coo_ns = ifelse(is.na(coo_ns), "NA", coo_ns))

# I'm moving the patient column to rownames so `coo` can be used directly in pheatmap
coo_annot <- 
  coo %>%
  as.data.frame() %>%
  tibble::column_to_rownames("patient")

# I'm also creating a version for just RNA-seq samples
coo_annot_rnaseq <- coo_annot[patients_rnaseq, , drop = FALSE]

# Ensure no NAs
testthat::expect_false(any(is.na(coo)))

Normal Content

Input file: /Users/brunogrande/Repos/dlbcl_coo_classification/data/normal_content.tsv

How it was generated: This file contains the normal tissue content estimates by OncoSNP. N.B. The patients in this dataset do not perfectly overlap with patients in the Lymph2Cx dataset.

# On gphost01, working directory: /genesis/extscratch/shahlab/ssaberi/ONCOSNP/stromal-intra
awk 'BEGIN {FS=OFS="\t"; print "patient", "nc"} $8 == 2 {print FILENAME,$2}' *.qc | sed 's/\.qc//' > /projects/bgrande/experiments/2016-08-10_dlbcl_coo_classification/data/normal_content.tsv

nc <- readr::read_tsv(nc_path)

# Caution: Patients unique to either one of the datasets
sym_diff(nc$patient, coo$patient)

 [1] "DLC0020" "DLC0026" "DLC0032" "DLC0034" "DLC0040" "DLC0072" "DLC0075" "DLC0102" "DLC0112"
[10] "DLC0132" "DLC0144" "DLC0153" "DLC0159" "DLC0180" "DLC0184" "DLC0215" "DLC0223" "DLC0250"
[19] "DLC0257" "DLC0264" "DLC0265" "DLC0266" "DLC0290" "DLC0302" "DLC0345" "DLC0346" "DLC0349"
[28] "DLC0350" "DLC0365" "DLC0373" "DLC0377" "DLC0381" "DLC0387" "DLC0014" "DLC0125" "DLC0130"
[37] "DLC0270" "DLC0380" "DLC0384" "DLC0399" "DLC0401" "DLC0403"

# For now, I will restrict this dataset to those present in the COO
nc <- dplyr::filter(nc, patient %in% coo$patient)

Mutation data

SNVs and Indels

Input file: /Users/brunogrande/Repos/dlbcl_coo_classification/data/snvs_and_indels.maf

How it was generated: This file contains mutation data from our lab’s lymphoma gene hybridization capture and sequencing experiment in MAF format. It was copied from the Brainiac NAS: morinlab/projects/2016_dlbcl_dlc_lymphoma_gene_pool/analysis/strelka_pipeline/10-post_process/clean.passed.all.filt.maf.

snvs_and_indels <- 
  readr::read_tsv(snvs_and_indels_path, progress = FALSE) %>%
  dplyr::filter(!is.na(HGVSp_Short),
                HGVSp_Short != "p.=")

muts <-
  snvs_and_indels %>%
  dplyr::mutate(patient = sub("_", "", Tumor_Sample_Barcode),
                change = sub("^p[.]", "", HGVSp_Short),
                type = get_type(Consequence),
                codon = get_codon(type, change)) %>%
  dplyr::select(patient, gene = Hugo_Symbol, change, type, codon)

# Ensure no NAs
testthat::expect_false(any(is.na(muts)))

# Ensure that all patients are included in the coo data frame
testthat::expect_length(setdiff(muts$patient, coo$patient), 0)

Input file: /Users/brunogrande/Repos/dlbcl_coo_classification/data/mutations.tsv

How it was generated: This file contains mutation data from our lab’s lymphoma gene hybridization capture and sequencing experiment in Oncoprinter format. This file was manually edited by Ryan to include additional mutations that weren’t included in the original Oncoprinter file (e.g. CNVs affecting NFKBIZ). It was copied from Thanos: /home/rmorin/work/expression/passed.oncoprinter.tsv.

For the mutation data, I’m also filtering for patients in the RNA-seq dataset. I am excluding any incomplete rows, which are normally added to Oncoprinter files to ensure correct cohort-wide gene mutation rates.

oncoprint_cols <- c("patient", "gene", "change", "type")

muts_raw <- suppressWarnings(
  readr::read_tsv(muts_path, skip = 1, col_names = oncoprint_cols, progress = FALSE)) 

muts_orig <- 
  muts_raw %>%
  dplyr::filter(!is.na(gene),
                patient %in% patients_rnaseq) %>%
  dplyr::mutate(type = tolower(type),
                codon = get_codon(type, change)) %>%
  dplyr::filter(!(gene == "NFKBIZ" & change == "GAIN"))

# Ensure no NAs
testthat::expect_false(any(is.na(muts_orig)))

N.B. There are 5 patients represented in the RNA-seq data but not in the mutation data. This is caused by the incomplete status of the targeted sequencing experiment.

Gene symbols

To annotate the gene IDs in the RNA-seq dataset, I’m using the BioMart API to obtain HGNC gene symbols from Ensembl 75 (Feb. 2014), which is the last version updated for GRCh37. If no gene symbol was found, I fall back on the gene ID.

ensembl_75 <- 
  biomaRt::useMart(biomart = "ENSEMBL_MART_ENSEMBL", 
                   host = "feb2014.archive.ensembl.org", 
                   dataset = "hsapiens_gene_ensembl")

bm_attrs <- c("ensembl_gene_id", "hgnc_symbol", "entrezgene")

bm <- 
  biomaRt::getBM(mart = ensembl_75, 
                 filters = "ensembl_gene_id", 
                 values = gene_ids, 
                 attributes = bm_attrs)

# Remove duplicates
dups <- with(bm, ensembl_gene_id[duplicated(ensembl_gene_id)])
bm <- bm[!bm$ensembl_gene_id %in% dups, ]

genes_df <- 
  tibble::tibble(ensembl_gene_id = gene_ids) %>%
  dplyr::left_join(bm, by = "ensembl_gene_id") %>%
  magrittr::set_colnames(c("gene_id", "symbol", "entrez")) %>%
  dplyr::mutate(symbol = ifelse(symbol == "", NA_character_, symbol),
                gene = ifelse(is.na(symbol), gene_id, symbol))

genes <- genes_df$gene

# Ensure no NAs
testthat::expect_false(any(is.na(genes)))
testthat::expect_equal(length(gene_ids), length(genes))

N.B. There are 1259 gene IDs that were in the expression matrix but not found by BioMart.

Gene lists

Wright et al. genes

Input files: /Users/brunogrande/Repos/dlbcl_coo_classification/reference/wright_genes.txt

How it was generated: This file contains a list of Ensembl gene IDs reported by Wright et al. to define COO status. It was compiled by Ryan.

wright_genes <- 
  readr::read_lines(wright_genes_path) %>% 
  intersect(gene_ids)

# Ensure no NAs
testthat::expect_false(any(is.na(wright_genes)))

N.B. There are 148 genes in the Wright gene list.

Lymph2Cx genes

Input file: /Users/brunogrande/Repos/dlbcl_coo_classification/reference/lymph2cx_genes.txt

How it was generated: This files contains the Ensembl gene IDs and symbols for the 20 genes included in the Lymph2Cx NanoString assay. It was manually generated by obtaining the list of gene symbols from Scott et al. (2014) and obtaining the Ensembl gene IDs from the GRCh37 Ensembl BioMart (release 75).

lymph2cx_genes <- 
  readr::read_tsv(lymph2cx_genes_path) %>%
  .$ens_id %>%
  intersect(gene_ids)

# Ensure no NAs
testthat::expect_false(any(is.na(lymph2cx_genes)))

N.B. There are 20 genes in the Lymph2Cx gene list.